nikunjkdtechnoland commited on
Commit
e041d7d
1 Parent(s): 89c278d

init commit some more add files

Browse files
iopaint/file_manager/storage_backends.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copy from https://github.com/silentsokolov/flask-thumbnails/blob/master/flask_thumbnails/storage_backends.py
2
+ import errno
3
+ import os
4
+ from abc import ABC, abstractmethod
5
+
6
+
7
class BaseStorageBackend(ABC):
    """Abstract interface every storage backend has to implement.

    Subclasses must provide ``read``, ``exists`` and ``save``; the base
    versions only raise ``NotImplementedError``.
    """

    def __init__(self, app=None):
        # Optional application object, stored unchanged for subclasses to use.
        self.app = app

    @abstractmethod
    def read(self, filepath, mode="rb", **kwargs):
        """Return the content stored at *filepath*."""
        raise NotImplementedError()

    @abstractmethod
    def exists(self, filepath):
        """Tell whether *filepath* is present in the storage."""
        raise NotImplementedError()

    @abstractmethod
    def save(self, filepath, data):
        """Store *data* under *filepath*."""
        raise NotImplementedError()
22
+
23
+
24
class FilesystemStorageBackend(BaseStorageBackend):
    """Storage backend that keeps files on the local filesystem."""

    def read(self, filepath, mode="rb", **kwargs):
        """Open *filepath* with *mode* and return its whole content.

        Extra keyword arguments are accepted for interface compatibility
        and ignored.
        """
        with open(filepath, mode) as f:  # pylint: disable=unspecified-encoding
            return f.read()

    def exists(self, filepath):
        """Return True when *filepath* exists on disk."""
        return os.path.exists(filepath)

    def save(self, filepath, data):
        """Write the bytes *data* to *filepath*, creating parent directories.

        Raises:
            IOError: if the parent path exists but is not a directory.
            OSError: if the parent directories cannot be created.
        """
        directory = os.path.dirname(filepath)

        # A bare file name has no directory component; os.makedirs("") would
        # raise, so directory handling only happens when there is one.
        if directory:
            try:
                # exist_ok avoids the check-then-create race between processes.
                os.makedirs(directory, exist_ok=True)
            except FileExistsError:
                # The path exists but is not a directory; reported just below
                # with the backend's own message.
                pass

            if not os.path.isdir(directory):
                raise IOError("{} is not a directory".format(directory))

        with open(filepath, "wb") as f:
            f.write(data)
iopaint/model/anytext/cldm/recognizer.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) Alibaba, Inc. and its affiliates.
3
+ """
4
+ import os
5
+ import cv2
6
+ import numpy as np
7
+ import math
8
+ import traceback
9
+ from easydict import EasyDict as edict
10
+ import time
11
+ from iopaint.model.anytext.ocr_recog.RecModel import RecModel
12
+ import torch
13
+ import torch.nn.functional as F
14
+
15
+
16
def min_bounding_rect(img):
    """Return the corners of the minimum-area rectangle around the largest blob.

    The image is thresholded at 127, the external contour with the largest
    area is selected and its rotated bounding box is computed.

    Returns:
        A (4, 2) integer array ordered top-left, top-right, bottom-right,
        bottom-left. When no contour is found, a fixed 100x100 placeholder
        box is returned after printing a warning.
    """
    _, thresh = cv2.threshold(img, 127, 255, 0)
    contours, _ = cv2.findContours(
        thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    if len(contours) == 0:
        print("Bad contours, using fake bbox...")
        return np.array([[0, 0], [100, 0], [100, 100], [0, 100]])
    max_contour = max(contours, key=cv2.contourArea)
    rect = cv2.minAreaRect(max_contour)
    # np.int0 was an alias of np.intp and no longer exists in NumPy 2.x.
    box = cv2.boxPoints(rect).astype(np.intp)
    # Split the corners into the two left-most and the two right-most points,
    # then order each pair by y so the first one is the upper corner.
    x_sorted = sorted(box, key=lambda p: p[0])
    tl, bl = sorted(x_sorted[:2], key=lambda p: p[1])
    tr, br = sorted(x_sorted[2:], key=lambda p: p[1])
    return np.array([tl, tr, br, bl])
41
+
42
+
43
def create_predictor(model_dir=None, model_lang="ch", is_onnx=False):
    """Build the OCR recognition predictor.

    Args:
        model_dir: path to the weight file, or None to skip loading weights.
        model_lang: "ch" or "en"; selects the number of output classes.
        is_onnx: when True an onnxruntime session is returned instead of a
            torch module.

    Raises:
        ValueError: if *model_dir* is given but does not exist, or if
            *model_lang* is not supported (torch path only).
    """
    weights_path = model_dir
    if weights_path is not None and not os.path.exists(weights_path):
        raise ValueError("not find model file path {}".format(weights_path))

    if is_onnx:
        import onnxruntime as ort

        # Other possible providers:
        # 'TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'
        return ort.InferenceSession(
            weights_path, providers=["CPUExecutionProvider"]
        )

    if model_lang == "ch":
        n_class = 6625
    elif model_lang == "en":
        n_class = 97
    else:
        raise ValueError(f"Unsupported OCR recog model_lang: {model_lang}")

    backbone_cfg = edict(
        type="MobileNetV1Enhance",
        scale=0.5,
        last_conv_stride=[1, 2],
        last_pool_type="avg",
    )
    neck_cfg = edict(
        type="SequenceEncoder",
        encoder_type="svtr",
        dims=64,
        depth=2,
        hidden_dims=120,
        use_guide=True,
    )
    head_cfg = edict(
        type="CTCHead",
        fc_decay=0.00001,
        out_channels=n_class,
        return_feats=True,
    )
    rec_config = edict(
        in_channels=3,
        backbone=backbone_cfg,
        neck=neck_cfg,
        head=head_cfg,
    )

    rec_model = RecModel(rec_config)
    if weights_path is not None:
        state = torch.load(weights_path, map_location="cpu")
        rec_model.load_state_dict(state)
    # eval() returns the module itself, switched to inference mode.
    return rec_model.eval()
91
+
92
+
93
def _check_image_file(path):
    """Return True when *path* ends (case-insensitively) with an image suffix."""
    # str.endswith accepts a tuple, so one call covers every suffix.
    suffixes = ("jpg", "bmp", "png", "jpeg", "rgb", "tif", "tiff")
    return path.lower().endswith(suffixes)
96
+
97
+
98
def get_image_file_list(img_file):
    """Collect image file paths from a single file or a directory.

    A directory is scanned non-recursively. The result is sorted.

    Raises:
        Exception: if *img_file* is None, does not exist, or yields no image.
    """
    if img_file is None or not os.path.exists(img_file):
        raise Exception("not found any img file in {}".format(img_file))

    found = []
    if os.path.isfile(img_file) and _check_image_file(img_file):
        found.append(img_file)
    elif os.path.isdir(img_file):
        candidates = (
            os.path.join(img_file, entry) for entry in os.listdir(img_file)
        )
        found.extend(
            candidate
            for candidate in candidates
            if os.path.isfile(candidate) and _check_image_file(candidate)
        )

    if not found:
        raise Exception("not found any img file in {}".format(img_file))
    return sorted(found)
113
+
114
+
115
class TextRecognizer(object):
    """Batch text-line recognizer around a torch or ONNX predictor.

    ``args`` must provide ``rec_image_shape`` (a "C,H,W" string),
    ``rec_batch_num`` and ``rec_char_dict_path``; ``use_fp16`` is optional
    and defaults to False.
    """

    def __init__(self, args, predictor):
        self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
        self.rec_batch_num = args.rec_batch_num
        self.predictor = predictor
        self.chars = self.get_char_dict(args.rec_char_dict_path)
        self.char2id = {x: i for i, x in enumerate(self.chars)}
        # Anything that is not a torch module is driven through the
        # onnxruntime-style get_inputs()/run() interface.
        self.is_onnx = not isinstance(self.predictor, torch.nn.Module)
        # Optional so configurations without the flag keep working.
        self.use_fp16 = getattr(args, "use_fp16", False)

    # img: CHW
    def resize_norm_img(self, img, max_wh_ratio):
        """Resize *img* to the model height, scale to [-1, 1] and right-pad.

        Args:
            img: CHW tensor with values in 0-255.
            max_wh_ratio: width/height ratio of the padded output.

        Returns:
            A float32 tensor of shape (C, imgH, int(imgH * max_wh_ratio)) on
            the device of *img*; columns past the resized width stay zero.
        """
        imgC, imgH, imgW = self.rec_image_shape
        assert imgC == img.shape[0]
        imgW = int((imgH * max_wh_ratio))

        h, w = img.shape[1:]
        ratio = w / float(h)
        # Keep the aspect ratio, but never exceed the padded width.
        if math.ceil(imgH * ratio) > imgW:
            resized_w = imgW
        else:
            resized_w = int(math.ceil(imgH * ratio))
        resized_image = torch.nn.functional.interpolate(
            img.unsqueeze(0),
            size=(imgH, resized_w),
            mode="bilinear",
            align_corners=True,
        )
        # 0..255 -> -1..1
        resized_image /= 255.0
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = torch.zeros((imgC, imgH, imgW), dtype=torch.float32).to(img.device)
        padding_im[:, :, 0:resized_w] = resized_image[0]
        return padding_im

    # img_list: list of tensors with shape chw 0-255
    def pred_imglist(self, img_list, show_debug=False, is_ori=False):
        """Run the predictor over *img_list* in batches.

        Images noticeably taller than wide (h > 1.2 * w) are rotated, and the
        rotated tensor replaces the entry in *img_list* (the caller's list is
        modified in place).

        Args:
            img_list: non-empty list of CHW tensors with values in 0-255.
            show_debug: when True, every normalised input is written as
                "<index>[_ori].jpg" in the current directory.
            is_ori: adds the "_ori" suffix to the debug file names.

        Returns:
            Tuple ``(ctc, ctc_neck)`` of stacked per-image predictor outputs,
            in the same order as *img_list*.
        """
        img_num = len(img_list)
        assert img_num > 0
        # Calculate the aspect ratio of all text bars
        width_list = [img.shape[2] / float(img.shape[1]) for img in img_list]
        # Sorting can speed up the recognition process.
        # Plain ints are used so they index lists and format as numbers.
        indices = np.argsort(np.array(width_list)).tolist()
        batch_num = self.rec_batch_num
        preds_all = [None] * img_num
        preds_neck_all = [None] * img_num
        for beg_img_no in range(0, img_num, batch_num):
            end_img_no = min(img_num, beg_img_no + batch_num)
            batch_indices = indices[beg_img_no:end_img_no]

            imgC, imgH, imgW = self.rec_image_shape[:3]
            # A fixed ratio is used for every batch (no per-batch maximum).
            max_wh_ratio = imgW / imgH
            for idx in batch_indices:
                h, w = img_list[idx].shape[1:]
                if h > w * 1.2:
                    # Vertical text line: rotate so it reads horizontally.
                    img_list[idx] = torch.transpose(img_list[idx], 1, 2).flip(dims=[1])

            norm_img_batch = []
            for idx in batch_indices:
                norm_img = self.resize_norm_img(img_list[idx], max_wh_ratio)
                if self.use_fp16:
                    norm_img = norm_img.half()
                norm_img_batch.append(norm_img.unsqueeze(0))
            norm_img_batch = torch.cat(norm_img_batch, dim=0)

            if show_debug:
                for i, idx in enumerate(batch_indices):
                    _img = norm_img_batch[i].permute(1, 2, 0).detach().cpu().numpy()
                    # Invert the [-1, 1] normalisation of resize_norm_img.
                    _img = (_img * 0.5 + 0.5) * 255
                    _img = _img[:, :, ::-1]
                    file_name = f"{idx}"
                    file_name = file_name + "_ori" if is_ori else file_name
                    cv2.imwrite(file_name + ".jpg", _img)

            if self.is_onnx:
                input_dict = {}
                input_dict[self.predictor.get_inputs()[0].name] = (
                    norm_img_batch.detach().cpu().numpy()
                )
                outputs = self.predictor.run(None, input_dict)
                preds = {}
                preds["ctc"] = torch.from_numpy(outputs[0])
                # The ONNX session exposes no neck features; use placeholders.
                preds["ctc_neck"] = [torch.zeros(1)] * img_num
            else:
                preds = self.predictor(norm_img_batch)

            # Scatter the batch results back to the original image order.
            for rno in range(preds["ctc"].shape[0]):
                preds_all[indices[beg_img_no + rno]] = preds["ctc"][rno]
                preds_neck_all[indices[beg_img_no + rno]] = preds["ctc_neck"][rno]

        return torch.stack(preds_all, dim=0), torch.stack(preds_neck_all, dim=0)

    def get_char_dict(self, character_dict_path):
        """Load the character list: "sos", one entry per file line, then " "."""
        character_str = []
        with open(character_dict_path, "rb") as fin:
            for line in fin.readlines():
                # Remove line-break characters only; spaces are kept.
                character_str.append(line.decode("utf-8").strip("\r\n"))
        return ["sos"] + character_str + [" "]  # eos is space

    def get_text(self, order):
        """Join the characters addressed by the ids in *order*."""
        return "".join(self.chars[text_id] for text_id in order)

    def decode(self, mat):
        """Greedy CTC decode of a (T, C) score matrix.

        Returns:
            ``(ids, positions)``: the class ids left after collapsing repeats
            and dropping id 0, and the time steps they came from.
        """
        text_index = mat.detach().cpu().numpy().argmax(axis=1)
        ignored_tokens = [0]
        selection = np.ones(len(text_index), dtype=bool)
        # Keep a step only when it differs from the previous step.
        selection[1:] = text_index[1:] != text_index[:-1]
        for ignored_token in ignored_tokens:
            selection &= text_index != ignored_token
        return text_index[selection], np.where(selection)[0]

    def get_ctcloss(self, preds, gt_text, weight):
        """Per-sample CTC loss divided by the sequence length, times *weight*.

        Args:
            preds: (N, T, C) unnormalised scores.
            gt_text: N ground-truth strings; characters missing from the
                dictionary map to the last dictionary entry.
            weight: per-sample weights (tensor or array-like).
        """
        if not isinstance(weight, torch.Tensor):
            weight = torch.tensor(weight).to(preds.device)
        ctc_loss = torch.nn.CTCLoss(reduction="none")
        log_probs = preds.log_softmax(dim=2).permute(1, 0, 2)  # NTC-->TNC
        targets = []
        target_lengths = []
        for t in gt_text:
            targets += [self.char2id.get(i, len(self.chars) - 1) for i in t]
            target_lengths += [len(t)]
        targets = torch.tensor(targets).to(preds.device)
        target_lengths = torch.tensor(target_lengths).to(preds.device)
        input_lengths = torch.tensor([log_probs.shape[0]] * (log_probs.shape[1])).to(
            preds.device
        )
        loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
        loss = loss / input_lengths * weight
        return loss
252
+
253
+
254
def main():
    """Smoke test: recognise the images in ./test_imgs_cn and print results.

    Loads the weights and dictionary from ./ocr_weights, times ten
    recognition passes, then prints prediction, ground truth and CTC loss for
    every image. Any exception raised while recognising is printed with its
    traceback instead of being propagated.
    """
    rec_model_dir = "./ocr_weights/ppv3_rec.pth"
    predictor = create_predictor(rec_model_dir)
    args = edict()
    args.rec_image_shape = "3, 48, 320"
    args.rec_char_dict_path = "./ocr_weights/ppocr_keys_v1.txt"
    args.rec_batch_num = 6
    # TextRecognizer.__init__ reads this flag; run the demo in full precision.
    args.use_fp16 = False
    text_recognizer = TextRecognizer(args, predictor)
    image_dir = "./test_imgs_cn"
    # The demo expects 14 images that all show the same text.
    gt_text = ["韩国小馆"] * 14

    image_file_list = get_image_file_list(image_dir)
    valid_image_file_list = []
    img_list = []

    for image_file in image_file_list:
        img = cv2.imread(image_file)
        if img is None:
            print("error in loading image:{}".format(image_file))
            continue
        valid_image_file_list.append(image_file)
        # HWC array -> CHW float tensor
        img_list.append(torch.from_numpy(img).permute(2, 0, 1).float())
    try:
        tic = time.time()
        times = []
        for i in range(10):
            preds, _ = text_recognizer.pred_imglist(img_list)  # get text
            preds_all = preds.softmax(dim=2)
            times += [(time.time() - tic) * 1000.0]
            tic = time.time()
        print(times)
        # The first pass is excluded from the average as warm-up.
        print(np.mean(times[1:]) / len(preds_all))
        weight = np.ones(len(gt_text))
        loss = text_recognizer.get_ctcloss(preds, gt_text, weight)
        for i in range(len(valid_image_file_list)):
            pred = preds_all[i]
            order, idx = text_recognizer.decode(pred)
            text = text_recognizer.get_text(order)
            print(
                f'{valid_image_file_list[i]}: pred/gt="{text}"/"{gt_text[i]}", loss={loss[i]:.2f}'
            )
    except Exception as E:
        print(traceback.format_exc(), E)


if __name__ == "__main__":
    main()
iopaint/model/anytext/ldm/models/diffusion/dpm_solver/sampler.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SAMPLING ONLY."""
2
+ import torch
3
+
4
+ from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver
5
+
6
+
7
# Maps the model's parameterization name to the model_type expected by
# model_wrapper.
MODEL_TYPES = {
    "eps": "noise",
    "v": "v"
}


class DPMSolverSampler(object):
    """Sampler that runs DPM-Solver on top of a latent diffusion model."""

    def __init__(self, model, **kwargs):
        super().__init__()
        self.model = model
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(model.device)
        self.register_buffer('alphas_cumprod', to_torch(model.alphas_cumprod))

    def register_buffer(self, name, attr):
        """Store *attr* as ``self.<name>``; tensors go to the model's device.

        Using the model's device (instead of a fixed "cuda") keeps the sampler
        usable on hosts without CUDA and keeps buffers next to the model.
        """
        if isinstance(attr, torch.Tensor):
            attr = attr.to(self.model.device)
        setattr(self, name, attr)

    @torch.no_grad()
    def sample(self,
               S,
               batch_size,
               shape,
               conditioning=None,
               callback=None,
               normals_sequence=None,
               img_callback=None,
               quantize_x0=False,
               eta=0.,
               mask=None,
               x0=None,
               temperature=1.,
               noise_dropout=0.,
               score_corrector=None,
               corrector_kwargs=None,
               verbose=True,
               x_T=None,
               log_every_t=100,
               unconditional_guidance_scale=1.,
               unconditional_conditioning=None,
               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
               **kwargs
               ):
        """Draw samples with the multistep DPM-Solver (order 2).

        Only ``S``, ``batch_size``, ``shape``, ``conditioning``, ``x_T``,
        ``unconditional_guidance_scale`` and ``unconditional_conditioning``
        are used; the remaining parameters are accepted and ignored.

        Args:
            S: number of solver steps.
            batch_size: number of samples to generate.
            shape: (C, H, W) of one sample.
            conditioning: conditioning tensor, or dict of tensors.
            x_T: optional start noise; drawn from a normal distribution
                when None.

        Returns:
            ``(samples, None)``; the second element is always None.
        """
        if conditioning is not None:
            # Only warns; a mismatching batch size is not treated as an error.
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)

        print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}')

        device = self.model.betas.device
        if x_T is None:
            img = torch.randn(size, device=device)
        else:
            img = x_T

        ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod)

        model_fn = model_wrapper(
            lambda x, t, c: self.model.apply_model(x, t, c),
            ns,
            model_type=MODEL_TYPES[self.model.parameterization],
            guidance_type="classifier-free",
            condition=conditioning,
            unconditional_condition=unconditional_conditioning,
            guidance_scale=unconditional_guidance_scale,
        )

        dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False)
        x = dpm_solver.sample(img, steps=S, skip_type="time_uniform", method="multistep", order=2, lower_order_final=True)

        return x.to(device), None
iopaint/model/anytext/ldm/models/diffusion/plms.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SAMPLING ONLY."""
2
+
3
+ import torch
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from functools import partial
7
+
8
+ from iopaint.model.anytext.ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
9
+ from iopaint.model.anytext.ldm.models.diffusion.sampling_util import norm_thresholding
10
+
11
+
12
class PLMSSampler(object):
    """Pseudo linear multistep (PLMS) sampler for a latent diffusion model."""

    def __init__(self, model, schedule="linear", **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule

    def register_buffer(self, name, attr):
        """Store *attr* as ``self.<name>``; tensors go to the model's device.

        Using the model's device (instead of a fixed "cuda") keeps the sampler
        usable on hosts without CUDA and keeps buffers next to the model.
        Non-tensor values (e.g. numpy arrays) are stored unchanged.
        """
        if isinstance(attr, torch.Tensor):
            attr = attr.to(self.model.device)
        setattr(self, name, attr)

    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
        """Precompute the timestep subset and all schedule buffers.

        Raises:
            ValueError: if *ddim_eta* is not 0.
        """
        if ddim_eta != 0:
            raise ValueError('ddim_eta must be 0 for PLMS')
        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
        alphas_cumprod = self.model.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        # ddim sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                   ddim_timesteps=self.ddim_timesteps,
                                                                                   eta=ddim_eta,verbose=verbose)
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    @torch.no_grad()
    def sample(self,
               S,
               batch_size,
               shape,
               conditioning=None,
               callback=None,
               normals_sequence=None,
               img_callback=None,
               quantize_x0=False,
               eta=0.,
               mask=None,
               x0=None,
               temperature=1.,
               noise_dropout=0.,
               score_corrector=None,
               corrector_kwargs=None,
               verbose=True,
               x_T=None,
               log_every_t=100,
               unconditional_guidance_scale=1.,
               unconditional_conditioning=None,
               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
               dynamic_threshold=None,
               **kwargs
               ):
        """Build the schedule for *S* steps and run PLMS sampling.

        Args:
            S: number of sampling steps.
            batch_size: number of samples.
            shape: (C, H, W) of one sample.
            eta: must be 0 (checked by ``make_schedule``).

        Returns:
            ``(samples, intermediates)`` as produced by ``plms_sampling``.
        """
        if conditioning is not None:
            # Only warns; a mismatching batch size is not treated as an error.
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        print(f'Data shape for PLMS sampling is {size}')

        samples, intermediates = self.plms_sampling(conditioning, size,
                                                    callback=callback,
                                                    img_callback=img_callback,
                                                    quantize_denoised=quantize_x0,
                                                    mask=mask, x0=x0,
                                                    ddim_use_original_steps=False,
                                                    noise_dropout=noise_dropout,
                                                    temperature=temperature,
                                                    score_corrector=score_corrector,
                                                    corrector_kwargs=corrector_kwargs,
                                                    x_T=x_T,
                                                    log_every_t=log_every_t,
                                                    unconditional_guidance_scale=unconditional_guidance_scale,
                                                    unconditional_conditioning=unconditional_conditioning,
                                                    dynamic_threshold=dynamic_threshold,
                                                    )
        return samples, intermediates

    @torch.no_grad()
    def plms_sampling(self, cond, shape,
                      x_T=None, ddim_use_original_steps=False,
                      callback=None, timesteps=None, quantize_denoised=False,
                      mask=None, x0=None, img_callback=None, log_every_t=100,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None,
                      dynamic_threshold=None):
        """Iterate over the timesteps from last to first and denoise.

        Returns:
            ``(img, intermediates)`` where ``intermediates`` holds the logged
            ``x_inter`` and ``pred_x0`` tensors.
        """
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T

        if timesteps is None:
            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
        elif timesteps is not None and not ddim_use_original_steps:
            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {'x_inter': [img], 'pred_x0': [img]}
        time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps)
        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
        print(f"Running PLMS Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
        # Noise predictions of previous steps (at most three are kept).
        old_eps = []

        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((b,), step, device=device, dtype=torch.long)
            # On the last iteration the "next" timestep repeats the current one.
            ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)

            if mask is not None:
                assert x0 is not None
                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                img = img_orig * mask + (1. - mask) * img

            outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
                                      quantize_denoised=quantize_denoised, temperature=temperature,
                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
                                      corrector_kwargs=corrector_kwargs,
                                      unconditional_guidance_scale=unconditional_guidance_scale,
                                      unconditional_conditioning=unconditional_conditioning,
                                      old_eps=old_eps, t_next=ts_next,
                                      dynamic_threshold=dynamic_threshold)
            img, pred_x0, e_t = outs
            old_eps.append(e_t)
            if len(old_eps) >= 4:
                old_eps.pop(0)
            if callback: callback(i)
            if img_callback: img_callback(pred_x0, i)

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['x_inter'].append(img)
                intermediates['pred_x0'].append(pred_x0)

        return img, intermediates

    @torch.no_grad()
    def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None,
                      dynamic_threshold=None):
        """Perform one PLMS step.

        ``old_eps`` must be a list (possibly empty) of earlier noise
        predictions; its length selects the order of the multistep formula.

        Returns:
            ``(x_prev, pred_x0, e_t)``: the next sample, the current x_0
            estimate and the raw noise prediction for this step.
        """
        b, *_, device = *x.shape, x.device

        def get_model_output(x, t):
            if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
                e_t = self.model.apply_model(x, t, c)
            else:
                # One batched forward pass for the unconditional and the
                # conditional prediction, combined by the guidance scale.
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t] * 2)
                c_in = torch.cat([unconditional_conditioning, c])
                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
                e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)

            if score_corrector is not None:
                assert self.model.parameterization == "eps"
                e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)

            return e_t

        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas

        def get_x_prev_and_pred_x0(e_t, index):
            # select parameters corresponding to the currently considered timestep
            a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
            a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
            sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)

            # current prediction for x_0
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
            if quantize_denoised:
                pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
            if dynamic_threshold is not None:
                pred_x0 = norm_thresholding(pred_x0, dynamic_threshold)
            # direction pointing to x_t
            dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
            noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
            if noise_dropout > 0.:
                noise = torch.nn.functional.dropout(noise, p=noise_dropout)
            x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
            return x_prev, pred_x0

        e_t = get_model_output(x, t)
        if len(old_eps) == 0:
            # Pseudo Improved Euler (2nd order)
            x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
            e_t_next = get_model_output(x_prev, t_next)
            e_t_prime = (e_t + e_t_next) / 2
        elif len(old_eps) == 1:
            # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (3 * e_t - old_eps[-1]) / 2
        elif len(old_eps) == 2:
            # 3rd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
        elif len(old_eps) >= 3:
            # 4th order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24

        x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)

        return x_prev, pred_x0, e_t
iopaint/model/anytext/ldm/models/diffusion/sampling_util.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
+
5
def append_dims(x, target_dims):
    """Add trailing singleton axes to *x* until it has *target_dims* axes.

    Adapted from
    https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py

    Raises:
        ValueError: if *x* already has more than *target_dims* axes.
    """
    missing = target_dims - x.ndim
    if missing < 0:
        raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less')
    # Ellipsis keeps all existing axes; every None appends one new axis.
    index = (Ellipsis,) + (None,) * missing
    return x[index]
12
+
13
+
14
def norm_thresholding(x0, value):
    """Rescale each sample of *x0* whose RMS exceeds *value* down to *value*.

    The RMS is taken over all non-batch axes; samples whose RMS is at most
    *value* are multiplied by 1.
    """
    per_sample_rms = x0.pow(2).flatten(1).mean(1).sqrt()
    # Clamping from below makes the factor value / s at most 1.
    s = append_dims(per_sample_rms.clamp(min=value), x0.ndim)
    return x0 * (value / s)
17
+
18
+
19
def spatial_norm_thresholding(x0, value):
    """Limit the RMS over axis 1 of *x0* to *value* at every position.

    Positions whose RMS over axis 1 is at most *value* are left unchanged.
    """
    # b c h w
    rms_over_axis1 = x0.pow(2).mean(1, keepdim=True).sqrt()
    s = rms_over_axis1.clamp(min=value)
    scale = value / s
    return x0 * scale
iopaint/model/anytext/ocr_recog/RNN.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import nn
2
+ import torch
3
+ from .RecSVTR import Block
4
+
5
class Swish(nn.Module):
    """Swish activation: ``x * sigmoid(x)``."""

    def __init__(self):
        # Was misspelled ``__int__``, which is not a constructor.
        super(Swish, self).__init__()

    def forward(self, x):
        return x * torch.sigmoid(x)
11
+
12
class Im2Im(nn.Module):
    """Identity neck: returns its input and reports ``in_channels`` as output."""

    def __init__(self, in_channels, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        super(Im2Im, self).__init__()
        self.out_channels = in_channels

    def forward(self, x):
        # Pass-through; the same object is returned.
        return x
19
+
20
class Im2Seq(nn.Module):
    """Turn a (B, C, H, W) feature map into a (B, H*W, C) sequence."""

    def __init__(self, in_channels, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        super(Im2Seq, self).__init__()
        self.out_channels = in_channels

    def forward(self, x):
        batch, channels, height, width = x.shape
        # assert height == 1
        flat = x.reshape(batch, channels, height * width)
        # Move the channel axis last so each position becomes one step.
        return flat.permute((0, 2, 1))
31
+
32
class EncoderWithRNN(nn.Module):
    """Two-layer bidirectional LSTM encoder over batch-first sequences.

    ``hidden_size`` may be passed as a keyword argument (default 256); the
    output feature size is twice that value.
    """

    def __init__(self, in_channels, **kwargs):
        super().__init__()
        hidden = kwargs.get('hidden_size', 256)
        self.lstm = nn.LSTM(
            in_channels,
            hidden,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward states are concatenated.
        self.out_channels = 2 * hidden

    def forward(self, x):
        self.lstm.flatten_parameters()
        # Only the output sequence is used; the final states are dropped.
        encoded, _states = self.lstm(x)
        return encoded
43
+
44
class SequenceEncoder(nn.Module):
    """Neck that reshapes a feature map to a sequence and optionally encodes it.

    ``encoder_type`` is one of 'reshape' (reshape only), 'rnn' or 'svtr'.
    Remaining keyword arguments are forwarded to the selected encoder.
    """

    def __init__(self, in_channels, encoder_type='rnn', **kwargs):
        super().__init__()
        self.encoder_reshape = Im2Seq(in_channels)
        self.out_channels = self.encoder_reshape.out_channels
        self.encoder_type = encoder_type
        if encoder_type == 'reshape':
            self.only_reshape = True
            return

        support_encoder_dict = {
            'reshape': Im2Seq,
            'rnn': EncoderWithRNN,
            'svtr': EncoderWithSVTR
        }
        assert encoder_type in support_encoder_dict, '{} must in {}'.format(
            encoder_type, support_encoder_dict.keys())

        encoder_cls = support_encoder_dict[encoder_type]
        self.encoder = encoder_cls(self.encoder_reshape.out_channels, **kwargs)
        self.out_channels = self.encoder.out_channels
        self.only_reshape = False

    def forward(self, x):
        if self.encoder_type == 'svtr':
            # The SVTR encoder runs on the feature map; reshape afterwards.
            return self.encoder_reshape(self.encoder(x))

        # Every other encoder consumes the reshaped sequence.
        x = self.encoder_reshape(x)
        if self.only_reshape:
            return x
        return self.encoder(x)
76
+
77
class ConvBNLayer(nn.Module):
    """Conv2d followed by BatchNorm2d and a Swish activation.

    The ``act`` argument is accepted but not used: the activation is always
    ``Swish``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=0,
                 bias_attr=False,
                 groups=1,
                 act=nn.GELU):
        super(ConvBNLayer, self).__init__()
        conv_options = dict(
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=bias_attr,
        )
        self.conv = nn.Conv2d(in_channels, out_channels, **conv_options)
        self.norm = nn.BatchNorm2d(out_channels)
        self.act = Swish()

    def forward(self, inputs):
        return self.act(self.norm(self.conv(inputs)))
105
+
106
+
107
class EncoderWithSVTR(nn.Module):
    """Neck that mixes a feature map with global SVTR blocks.

    The input is reduced by two conv layers, passed through ``depth``
    ``Block`` modules as a flattened sequence, restored to a feature map,
    concatenated with the input along the channel axis and reduced to
    ``dims`` channels.
    """

    def __init__(
            self,
            in_channels,
            dims=64,  # XS
            depth=2,
            hidden_dims=120,
            use_guide=False,
            num_heads=8,
            qkv_bias=True,
            mlp_ratio=2.0,
            drop_rate=0.1,
            attn_drop_rate=0.1,
            drop_path=0.,
            qk_scale=None):
        super(EncoderWithSVTR, self).__init__()
        self.depth = depth
        self.use_guide = use_guide
        # reduce channels: in_channels -> in_channels // 8 -> hidden_dims
        self.conv1 = ConvBNLayer(
            in_channels, in_channels // 8, padding=1, act='swish')
        self.conv2 = ConvBNLayer(
            in_channels // 8, hidden_dims, kernel_size=1, act='swish')

        self.svtr_block = nn.ModuleList([
            Block(
                dim=hidden_dims,
                num_heads=num_heads,
                mixer='Global',
                HW=None,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                act_layer='swish',
                attn_drop=attn_drop_rate,
                drop_path=drop_path,
                norm_layer='nn.LayerNorm',
                epsilon=1e-05,
                prenorm=False) for i in range(depth)
        ])
        self.norm = nn.LayerNorm(hidden_dims, eps=1e-6)
        self.conv3 = ConvBNLayer(
            hidden_dims, in_channels, kernel_size=1, act='swish')
        # last conv-nxn, the input is concat of input tensor and conv3 output tensor
        self.conv4 = ConvBNLayer(
            2 * in_channels, in_channels // 8, padding=1, act='swish')

        self.conv1x1 = ConvBNLayer(
            in_channels // 8, dims, kernel_size=1, act='swish')
        self.out_channels = dims
        # re-initialise the weights of every submodule registered above
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one submodule according to its layer type."""
        # weight initialization
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out')
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, 0, 0.01)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.ConvTranspose2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out')
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)

    def forward(self, x):
        """Encode the 4-D feature map *x* and return a map with ``dims`` channels."""
        # for use guide
        if self.use_guide:
            z = x.clone()
            # NOTE(review): this only sets a plain Python attribute on the
            # tensor and does not call detach(), so it does not appear to
            # stop gradients here. Confirm whether detaching was intended
            # before changing it, since that would alter gradient flow.
            z.stop_gradient = True
        else:
            z = x
        # for short cut
        h = z
        # reduce dim
        z = self.conv1(z)
        z = self.conv2(z)
        # SVTR global block
        B, C, H, W = z.shape
        # (B, C, H, W) -> (B, H*W, C)
        z = z.flatten(2).permute(0, 2, 1)

        for blk in self.svtr_block:
            z = blk(z)

        z = self.norm(z)
        # last stage
        # (B, H*W, C) -> (B, C, H, W)
        z = z.reshape([-1, H, W, C]).permute(0, 3, 1, 2)
        z = self.conv3(z)
        # concatenate the shortcut with the mixed features along channels
        z = torch.cat((h, z), dim=1)
        z = self.conv1x1(self.conv4(z))

        return z

# Manual check: build the encoder and print its structure.
if __name__=="__main__":
    svtrRNN = EncoderWithSVTR(56)
    print(svtrRNN)
iopaint/model/anytext/ocr_recog/RecCTCHead.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import nn
2
+
3
+
4
class CTCHead(nn.Module):
    """Fully connected head that maps features to per-class logits.

    With ``mid_channels=None`` a single linear layer (``fc``) is used;
    otherwise two stacked linear layers (``fc1`` then ``fc2``) are used.

    Args:
        in_channels: size of the last dimension of the input features.
        out_channels: number of output classes.
        fc_decay: accepted for config compatibility; not used by this class.
        mid_channels: hidden width of the optional two-layer variant.
        return_feats: when true, ``forward`` returns a dict holding both the
            logits and the features that were fed to the last linear layer.
        **kwargs: ignored.
    """

    def __init__(self,
                 in_channels,
                 out_channels=6625,
                 fc_decay=0.0004,
                 mid_channels=None,
                 return_feats=False,
                 **kwargs):
        super().__init__()
        self.out_channels = out_channels
        self.mid_channels = mid_channels
        self.return_feats = return_feats

        if mid_channels is not None:
            self.fc1 = nn.Linear(in_channels, mid_channels, bias=True)
            self.fc2 = nn.Linear(mid_channels, out_channels, bias=True)
        else:
            self.fc = nn.Linear(in_channels, out_channels, bias=True)

    def forward(self, x, labels=None):
        """Compute logits for ``x``.

        ``labels`` is accepted but not used. Returns the logits tensor, or
        ``{'ctc': logits, 'ctc_neck': features}`` when ``return_feats`` is
        set, where ``features`` is the output of ``fc1`` in the two-layer
        variant and the unmodified input otherwise.
        """
        if self.mid_channels is not None:
            x = self.fc1(x)
            logits = self.fc2(x)
        else:
            logits = self.fc(x)

        if not self.return_feats:
            return logits
        return {'ctc': logits, 'ctc_neck': x}
iopaint/model/anytext/ocr_recog/RecModel.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import nn
2
+ from .RNN import SequenceEncoder, Im2Seq, Im2Im
3
+ from .RecMv1_enhance import MobileNetV1Enhance
4
+
5
+ from .RecCTCHead import CTCHead
6
+
7
# Registries that map the ``type`` string of each config section to the class
# that implements it.
backbone_dict = {
    "MobileNetV1Enhance": MobileNetV1Enhance,
}
neck_dict = {
    "SequenceEncoder": SequenceEncoder,
    "Im2Seq": Im2Seq,
    "None": Im2Im,
}
head_dict = {
    "CTCHead": CTCHead,
}
10
+
11
+
12
class RecModel(nn.Module):
    """Text-recognition model assembled as backbone -> neck -> head.

    Args:
        config: mapping-like object that supports ``'in_channels' in config``
            and attribute access to ``in_channels``, ``backbone``, ``neck``
            and ``head``. Each of the three sections is a mapping with a
            ``type`` key (looked up in ``backbone_dict`` / ``neck_dict`` /
            ``head_dict``); the remaining keys are passed to the selected
            class as keyword arguments.

    The config sections are copied before ``type`` is removed, so the
    caller's config is left intact and can be reused to build another model.
    """

    def __init__(self, config):
        super().__init__()
        assert 'in_channels' in config, 'in_channels must in model config'

        # Work on shallow copies: popping 'type' from the caller's mappings
        # would make a second construction from the same config fail.
        backbone_cfg = dict(config.backbone)
        backbone_type = backbone_cfg.pop('type')
        assert backbone_type in backbone_dict, f'backbone.type must in {backbone_dict}'
        self.backbone = backbone_dict[backbone_type](config.in_channels, **backbone_cfg)

        neck_cfg = dict(config.neck)
        neck_type = neck_cfg.pop('type')
        assert neck_type in neck_dict, f'neck.type must in {neck_dict}'
        self.neck = neck_dict[neck_type](self.backbone.out_channels, **neck_cfg)

        head_cfg = dict(config.head)
        head_type = head_cfg.pop('type')
        assert head_type in head_dict, f'head.type must in {head_dict}'
        self.head = head_dict[head_type](self.neck.out_channels, **head_cfg)

        self.name = f'RecModel_{backbone_type}_{neck_type}_{head_type}'

    def load_3rd_state_dict(self, _3rd_name, _state):
        """Forward a third-party state dict to backbone, neck and head.

        NOTE(review): requires every sub-module to implement
        ``load_3rd_state_dict`` -- confirm against the selected classes.
        """
        self.backbone.load_3rd_state_dict(_3rd_name, _state)
        self.neck.load_3rd_state_dict(_3rd_name, _state)
        self.head.load_3rd_state_dict(_3rd_name, _state)

    def forward(self, x):
        """Apply backbone, neck and head in sequence and return the head output."""
        x = self.backbone(x)
        x = self.neck(x)
        x = self.head(x)
        return x

    def encode(self, x):
        """Apply backbone and neck, then the head's ``ctc_encoder``.

        NOTE(review): relies on the head exposing a ``ctc_encoder``
        attribute -- verify that the configured head provides it.
        """
        x = self.backbone(x)
        x = self.neck(x)
        x = self.head.ctc_encoder(x)
        return x
iopaint/model/anytext/ocr_recog/RecMv1_enhance.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from .common import Activation
5
+
6
+
7
class ConvBNLayer(nn.Module):
    """Bias-free 2-D convolution, batch norm and an optional activation.

    Args:
        num_channels: input channels of the convolution.
        filter_size: convolution kernel size.
        num_filters: output channels of the convolution.
        stride: convolution stride.
        padding: convolution padding.
        channels: accepted but not used by this class.
        num_groups: convolution groups.
        act: activation name handed to ``Activation``; ``None`` disables
            the activation.
    """

    def __init__(self,
                 num_channels,
                 filter_size,
                 num_filters,
                 stride,
                 padding,
                 channels=None,
                 num_groups=1,
                 act='hard_swish'):
        super().__init__()
        self.act = act
        self._conv = nn.Conv2d(
            num_channels,
            num_filters,
            filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            bias=False)
        self._batch_norm = nn.BatchNorm2d(num_filters)
        # ``_act`` only exists when an activation was requested.
        if act is not None:
            self._act = Activation(act_type=act, inplace=True)

    def forward(self, inputs):
        """Return ``act(bn(conv(inputs)))``, skipping ``act`` when disabled."""
        y = self._batch_norm(self._conv(inputs))
        if self.act is None:
            return y
        return self._act(y)
40
+
41
+
42
class DepthwiseSeparable(nn.Module):
    """Depthwise conv, optional SE gate, then a 1x1 pointwise conv.

    Args:
        num_channels: input channels of the depthwise convolution.
        num_filters1: unscaled output width of the depthwise convolution.
        num_filters2: unscaled output width of the pointwise convolution.
        num_groups: unscaled group count of the depthwise convolution.
        stride: stride of the depthwise convolution.
        scale: multiplier applied (then truncated with ``int``) to
            ``num_filters1``, ``num_filters2`` and ``num_groups``.
        dw_size: kernel size of the depthwise convolution.
        padding: padding of the depthwise convolution.
        use_se: insert an ``SEModule`` between the two convolutions.
    """

    def __init__(self,
                 num_channels,
                 num_filters1,
                 num_filters2,
                 num_groups,
                 stride,
                 scale,
                 dw_size=3,
                 padding=1,
                 use_se=False):
        super().__init__()
        self.use_se = use_se

        mid_width = int(num_filters1 * scale)
        out_width = int(num_filters2 * scale)

        self._depthwise_conv = ConvBNLayer(
            num_channels=num_channels,
            num_filters=mid_width,
            filter_size=dw_size,
            stride=stride,
            padding=padding,
            num_groups=int(num_groups * scale))
        if use_se:
            self._se = SEModule(mid_width)
        self._pointwise_conv = ConvBNLayer(
            num_channels=mid_width,
            filter_size=1,
            num_filters=out_width,
            stride=1,
            padding=0)

    def forward(self, inputs):
        """Apply depthwise conv, the SE gate if enabled, then pointwise conv."""
        features = self._depthwise_conv(inputs)
        if self.use_se:
            features = self._se(features)
        return self._pointwise_conv(features)
77
+
78
+
79
class MobileNetV1Enhance(nn.Module):
    """MobileNetV1-style backbone built from depthwise-separable blocks.

    Layout: a stride-2 3x3 ``ConvBNLayer`` stem, thirteen
    ``DepthwiseSeparable`` blocks held in ``block_list`` and a final 2x2
    pooling layer. ``out_channels`` is ``int(1024 * scale)``.

    Args:
        in_channels: channels of the input image.
        scale: width multiplier applied to every block.
        last_conv_stride: stride of the last depthwise-separable block.
        last_pool_type: ``'avg'`` selects average pooling; any other value
            selects max pooling.
        **kwargs: ignored.
    """

    def __init__(self,
                 in_channels=3,
                 scale=0.5,
                 last_conv_stride=1,
                 last_pool_type='max',
                 **kwargs):
        super().__init__()
        self.scale = scale

        self.conv1 = ConvBNLayer(
            num_channels=in_channels,
            filter_size=3,
            channels=3,
            num_filters=int(32 * scale),
            stride=2,
            padding=1)

        # One row per block, in execution order:
        # (unscaled input width, unscaled output width, stride,
        #  depthwise kernel size, padding, use_se)
        # The unscaled input width is also used as the block's
        # ``num_filters1`` and ``num_groups``.
        block_specs = [
            (32, 64, 1, 3, 1, False),          # conv2_1
            (64, 128, 1, 3, 1, False),         # conv2_2
            (128, 128, 1, 3, 1, False),        # conv3_1
            (128, 256, (2, 1), 3, 1, False),   # conv3_2
            (256, 256, 1, 3, 1, False),        # conv4_1
            (256, 512, (2, 1), 3, 1, False),   # conv4_2
        ]
        block_specs += [(512, 512, 1, 5, 2, False)] * 5  # conv5 x5
        block_specs += [
            (512, 1024, (2, 1), 5, 2, True),               # conv5_6
            (1024, 1024, last_conv_stride, 5, 2, True),    # conv6
        ]

        blocks = [
            DepthwiseSeparable(
                num_channels=int(width * scale),
                num_filters1=width,
                num_filters2=out_width,
                num_groups=width,
                stride=stride,
                dw_size=dw_size,
                padding=padding,
                scale=scale,
                use_se=use_se)
            for width, out_width, stride, dw_size, padding, use_se in block_specs
        ]
        self.block_list = nn.Sequential(*blocks)

        pool_cls = nn.AvgPool2d if last_pool_type == 'avg' else nn.MaxPool2d
        self.pool = pool_cls(kernel_size=2, stride=2, padding=0)
        self.out_channels = int(1024 * scale)

    def forward(self, inputs):
        """Run stem, all blocks and the final pooling layer."""
        return self.pool(self.block_list(self.conv1(inputs)))
201
+
202
def hardsigmoid(x):
    """Return ``relu6(x + 3) / 6``, a piecewise-linear sigmoid approximation."""
    shifted = x + 3.
    # The in-place clamp acts on the fresh ``shifted`` tensor, not on ``x``.
    return F.relu6(shifted, inplace=True) / 6.
204
+
205
class SEModule(nn.Module):
    """Squeeze-and-excitation channel gate.

    Global-average-pools the input to 1x1, passes it through two 1x1
    convolutions (``channel -> channel // reduction -> channel``) with a
    ReLU in between, and scales the input by the hard-sigmoid of the result.

    Args:
        channel: channels of the input feature map.
        reduction: divisor for the bottleneck width.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        bottleneck = channel // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(
            channel, bottleneck, kernel_size=1, stride=1, padding=0, bias=True)
        self.conv2 = nn.Conv2d(
            bottleneck, channel, kernel_size=1, stride=1, padding=0, bias=True)

    def forward(self, inputs):
        """Return ``inputs`` multiplied by its per-channel gate."""
        gate = self.conv1(self.avg_pool(inputs))
        gate = self.conv2(F.relu(gate))
        return torch.mul(inputs, hardsigmoid(gate))
iopaint/model/anytext/ocr_recog/RecSVTR.py ADDED
@@ -0,0 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from torch.nn.init import trunc_normal_, zeros_, ones_
5
+ from torch.nn import functional
6
+
7
+
8
def drop_path(x, drop_prob=0., training=False):
    """Drop paths (stochastic depth) per sample.

    Meant for the main path of residual blocks. Each sample (first
    dimension of ``x``) is either zeroed or rescaled by ``1 / keep_prob``,
    where ``keep_prob = 1 - drop_prob``.

    Args:
        x: input tensor; the first dimension indexes samples.
        drop_prob: probability of dropping a sample.
        training: the input is returned unchanged when this is false.

    Returns:
        ``x`` itself when ``drop_prob == 0`` or ``training`` is false,
        otherwise a new tensor with the same shape as ``x``.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = torch.tensor(1 - drop_prob)
    # One random draw per sample, broadcast over all remaining dimensions.
    shape = (x.size()[0], ) + (1, ) * (x.ndim - 1)
    # Create the noise on the input's device; without ``device`` it lands on
    # the default device and the multiplication below fails for inputs that
    # live elsewhere.
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor = torch.floor(random_tensor)  # binarize to 0 / 1
    output = x.divide(keep_prob) * random_tensor
    return output
21
+
22
+
23
class Swish(nn.Module):
    """Swish activation: ``x * sigmoid(x)``."""

    def __init__(self):
        # The original spelled this method ``__int__``, which defined an
        # integer-conversion hook (calling a nonexistent ``super().__int__``)
        # instead of a constructor.
        super().__init__()

    def forward(self, x):
        """Return ``x * sigmoid(x)`` element-wise."""
        return x * torch.sigmoid(x)
29
+
30
+
31
class ConvBNLayer(nn.Module):
    """2-D convolution followed by batch norm and an activation.

    Args:
        in_channels: input channels of the convolution.
        out_channels: output channels of the convolution.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: convolution padding.
        bias_attr: passed to ``nn.Conv2d`` as ``bias``.
        groups: convolution groups.
        act: zero-argument callable returning the activation module.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=0,
                 bias_attr=False,
                 groups=1,
                 act=nn.GELU):
        super().__init__()
        # The Paddle source set a KaimingUniform weight initialiser here;
        # this port keeps the nn.Conv2d default.
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=bias_attr)
        self.norm = nn.BatchNorm2d(out_channels)
        self.act = act()

    def forward(self, inputs):
        """Return ``act(norm(conv(inputs)))``."""
        return self.act(self.norm(self.conv(inputs)))
59
+
60
+
61
class DropPath(nn.Module):
    """Module wrapper around ``drop_path`` (stochastic depth per sample).

    Dropping is only active while the module is in training mode, because
    ``self.training`` is forwarded to ``drop_path``.
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply ``drop_path`` with the stored probability."""
        return drop_path(x, drop_prob=self.drop_prob, training=self.training)
71
+
72
+
73
class Identity(nn.Module):
    """No-op module: ``forward`` hands its argument back unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return ``input`` itself (no copy)."""
        return input
79
+
80
+
81
class Mlp(nn.Module):
    """Two-layer feed-forward network with dropout after each layer.

    Args:
        in_features: size of the input's last dimension.
        hidden_features: hidden width; falls back to ``in_features`` when
            falsy.
        out_features: output width; falls back to ``in_features`` when falsy.
        act_layer: zero-argument callable returning the activation module.
            Any string value selects ``Swish`` instead.
        drop: dropout probability used after the activation and after the
            second linear layer.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = Swish() if isinstance(act_layer, str) else act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Return ``drop(fc2(drop(act(fc1(x)))))``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
106
+
107
+
108
class ConvMixer(nn.Module):
    """Token mixer that applies a grouped 2-D convolution over the token grid.

    Args:
        dim: channel width of the tokens.
        num_heads: used as the convolution's ``groups``.
        HW: ``(height, width)`` of the token grid; the input sequence length
            must equal ``height * width``.
        local_k: convolution kernel size ``(kh, kw)``; padding is
            ``(kh // 2, kw // 2)``.
    """

    def __init__(
            self,
            dim,
            num_heads=8,
            HW=(8, 25),
            local_k=(3, 3), ):
        super().__init__()
        self.HW = HW
        self.dim = dim
        self.local_mixer = nn.Conv2d(
            dim,
            dim,
            local_k,
            1, (local_k[0] // 2, local_k[1] // 2),
            groups=num_heads,
        )

    def forward(self, x):
        """Mix tokens of shape ``(B, h*w, dim)`` and return the same shape."""
        h = self.HW[0]
        w = self.HW[1]
        # (B, h*w, dim) -> (B, dim, h, w). The original used Paddle's
        # list-style ``transpose`` and ``reshape([0, ...])``; in torch,
        # ``transpose`` takes exactly two dims and 0 is not a
        # "copy this dim" marker, so ``permute`` and ``-1`` are required.
        x = x.permute(0, 2, 1).reshape(-1, self.dim, h, w)
        x = self.local_mixer(x)
        # (B, dim, h, w) -> (B, h*w, dim)
        x = x.flatten(2).permute(0, 2, 1)
        return x
134
+
135
+
136
class Attention(nn.Module):
    """Multi-head self-attention with an optional local-window mask.

    Args:
        dim: channel width of the tokens.
        num_heads: number of attention heads.
        mixer: ``'Local'`` adds the window mask to the attention logits
            (a mask is only built when ``HW`` is given); other values use
            unmasked attention.
        HW: ``(height, width)`` of the token grid, or ``None`` to read the
            sequence length and width from the input at call time.
        local_k: ``(kh, kw)`` size of the local attention window.
        qkv_bias: whether the fused q/k/v projection has a bias.
        qk_scale: logit scale; falls back to ``head_dim ** -0.5`` when falsy.
        attn_drop: dropout probability on the attention weights.
        proj_drop: dropout probability on the projected output.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 mixer='Global',
                 HW=(8, 25),
                 local_k=(7, 11),
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.HW = HW
        if HW is not None:
            H = HW[0]
            W = HW[1]
            self.N = H * W
            self.C = dim
        if mixer == 'Local' and HW is not None:
            # Register the mask as a buffer so it follows the module across
            # devices and dtypes (a plain tensor attribute would stay behind
            # on ``.to()`` / ``.half()``). ``persistent=False`` keeps it out
            # of ``state_dict``, so checkpoints stay unchanged.
            self.register_buffer(
                'mask', self._build_local_mask(H, W, local_k), persistent=False)
        self.mixer = mixer

    @staticmethod
    def _build_local_mask(H, W, local_k):
        """Build the additive ``(1, 1, H*W, H*W)`` local-window mask.

        Entry ``[q, k]`` is 0 when grid position ``k`` lies inside the
        ``local_k`` window centred on position ``q`` and ``-inf`` otherwise.
        """
        hk = local_k[0]
        wk = local_k[1]
        # Mark each query's window on a grid padded by the window size ...
        mask = torch.ones([H * W, H + hk - 1, W + wk - 1])
        for h in range(0, H):
            for w in range(0, W):
                mask[h * W + w, h:h + hk, w:w + wk] = 0.
        # ... then crop the padding away and flatten the grid to key indices.
        cropped = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk // 2].flatten(1)
        mask_inf = torch.full([H * W, H * W], fill_value=float('-inf'))
        mask = torch.where(cropped < 1, cropped, mask_inf)
        return mask[None, None, :]

    def forward(self, x):
        """Attend over tokens ``x`` of shape ``(B, N, C)``; same shape out."""
        if self.HW is not None:
            N = self.N
            C = self.C
        else:
            _, N, C = x.shape
        # (B, N, 3*C) -> (3, B, heads, N, C // heads)
        qkv = self.qkv(x).reshape(
            (-1, N, 3, self.num_heads, C // self.num_heads)).permute((2, 0, 3, 1, 4))
        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]

        attn = (q.matmul(k.permute((0, 1, 3, 2))))
        if self.mixer == 'Local':
            attn += self.mask
        attn = functional.softmax(attn, dim=-1)
        attn = self.attn_drop(attn)

        x = (attn.matmul(v)).permute((0, 2, 1, 3)).reshape((-1, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
196
+
197
+
198
class Block(nn.Module):
    """SVTR mixing block: a token mixer and an MLP, each with a residual.

    Args:
        dim: channel width of the tokens.
        num_heads: heads (attention) or groups (conv mixer).
        mixer: ``'Global'`` or ``'Local'`` selects ``Attention``; ``'Conv'``
            selects ``ConvMixer``. Anything else raises ``TypeError``.
        local_mixer: local window / kernel size handed to the mixer.
        HW: token grid size handed to the mixer.
        mlp_ratio: MLP hidden width as a multiple of ``dim``.
        qkv_bias, qk_scale, attn_drop: forwarded to ``Attention``.
        drop: dropout used by the attention projection and the MLP.
        drop_path: stochastic-depth probability; ``Identity`` is used when
            it is not greater than zero.
        act_layer: activation handed to ``Mlp``.
        norm_layer: a callable ``norm_layer(dim)`` or a string expression
            that evaluates to one (then called with ``eps=epsilon``).
        epsilon: epsilon for string-specified norm layers.
        prenorm: when true the norms are applied *after* each residual sum;
            when false they are applied to the branch input instead.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mixer='Global',
                 local_mixer=(7, 11),
                 HW=(8, 25),
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer='nn.LayerNorm',
                 epsilon=1e-6,
                 prenorm=True):
        super().__init__()
        # NOTE(security): a string ``norm_layer`` is resolved with ``eval``;
        # it must never come from untrusted input.
        named_norm = isinstance(norm_layer, str)

        if named_norm:
            self.norm1 = eval(norm_layer)(dim, eps=epsilon)
        else:
            self.norm1 = norm_layer(dim)

        if mixer in ('Global', 'Local'):
            self.mixer = Attention(
                dim,
                num_heads=num_heads,
                mixer=mixer,
                HW=HW,
                local_k=local_mixer,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                attn_drop=attn_drop,
                proj_drop=drop)
        elif mixer == 'Conv':
            self.mixer = ConvMixer(
                dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
        else:
            raise TypeError("The mixer must be one of [Global, Local, Conv]")

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()

        if named_norm:
            self.norm2 = eval(norm_layer)(dim, eps=epsilon)
        else:
            self.norm2 = norm_layer(dim)

        self.mlp_ratio = mlp_ratio
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)
        self.prenorm = prenorm

    def forward(self, x):
        """Apply the mixer branch, then the MLP branch, with residuals."""
        if not self.prenorm:
            # Norm on the branch input, residual added afterwards.
            x = x + self.drop_path(self.mixer(self.norm1(x)))
            return x + self.drop_path(self.mlp(self.norm2(x)))
        # Norm applied to the residual sum.
        x = self.norm1(x + self.drop_path(self.mixer(x)))
        return self.norm2(x + self.drop_path(self.mlp(x)))
259
+
260
+
261
class PatchEmbed(nn.Module):
    """Image to patch embedding.

    Stacks ``sub_num`` stride-2 3x3 ``ConvBNLayer`` stages (only 2 or 3 are
    supported; no projection is created for other values) and flattens the
    resulting feature map into a token sequence.

    Args:
        img_size: expected ``(height, width)`` of the input image.
        in_channels: channels of the input image.
        embed_dim: channel width of the produced tokens.
        sub_num: number of stride-2 stages.
    """

    def __init__(self,
                 img_size=(32, 100),
                 in_channels=3,
                 embed_dim=768,
                 sub_num=2):
        super().__init__()
        reduction = 2 ** sub_num
        self.img_size = img_size
        self.num_patches = (img_size[1] // reduction) * (img_size[0] // reduction)
        self.embed_dim = embed_dim
        self.norm = None

        # Output width of every stage; the last one is always ``embed_dim``.
        if sub_num == 2:
            widths = (embed_dim // 2, embed_dim)
        elif sub_num == 3:
            widths = (embed_dim // 4, embed_dim // 2, embed_dim)
        else:
            widths = ()

        if widths:
            stages = []
            prev_width = in_channels
            for width in widths:
                stages.append(
                    ConvBNLayer(
                        in_channels=prev_width,
                        out_channels=width,
                        kernel_size=3,
                        stride=2,
                        padding=1,
                        act=nn.GELU,
                        bias_attr=False))
                prev_width = width
            self.proj = nn.Sequential(*stages)

    def forward(self, x):
        """Embed an image batch ``(B, C, H, W)`` into tokens ``(B, N, embed_dim)``."""
        B, C, H, W = x.shape
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        tokens = self.proj(x)
        return tokens.flatten(2).permute(0, 2, 1)
328
+
329
+
330
class SubSample(nn.Module):
    """Down-sample a feature map and return it as a normalised token sequence.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channel width of the produced tokens.
        types: ``'Pool'`` averages an avg-pool and a max-pool (kernel
            ``(3, 5)``, padding ``(1, 2)``) and projects with a linear layer;
            any other value uses a 3x3 convolution with padding 1.
        stride: stride of the pooling layers / convolution.
        sub_norm: string expression evaluating to the norm class, which is
            called with ``out_channels``.
        act: zero-argument callable returning an activation module, or
            ``None`` for no activation.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 types='Pool',
                 stride=(2, 1),
                 sub_norm='nn.LayerNorm',
                 act=None):
        super().__init__()
        self.types = types
        if types != 'Pool':
            self.conv = nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=3,
                stride=stride,
                padding=1,
            )
        else:
            pool_args = dict(kernel_size=(3, 5), stride=stride, padding=(1, 2))
            self.avgpool = nn.AvgPool2d(**pool_args)
            self.maxpool = nn.MaxPool2d(**pool_args)
            self.proj = nn.Linear(in_channels, out_channels)
        # NOTE(security): ``sub_norm`` is resolved with ``eval``; it must
        # never come from untrusted input.
        self.norm = eval(sub_norm)(out_channels)
        self.act = act() if act is not None else None

    def forward(self, x):
        """Down-sample ``(B, C, H, W)`` and return tokens ``(B, N, out_channels)``."""
        if self.types == 'Pool':
            pooled = (self.avgpool(x) + self.maxpool(x)) * 0.5
            tokens = self.proj(pooled.flatten(2).permute((0, 2, 1)))
        else:
            tokens = self.conv(x).flatten(2).permute((0, 2, 1))

        tokens = self.norm(tokens)
        if self.act is not None:
            tokens = self.act(tokens)
        return tokens
376
+
377
+
378
class SVTRNet(nn.Module):
    """SVTR recognition backbone: patch embedding plus three mixing stages.

    Args:
        img_size: ``[height, width]`` of the input image.
        in_channels: channels of the input image.
        embed_dim: channel width of each of the three stages.
        depth: number of blocks in each stage.
        num_heads: heads per stage.
        mixer: mixer type for every block, in order across all stages
            (``'Local'``, ``'Global'`` or ``'Conv'``).
        local_mixer: local window size per stage.
        patch_merging: ``'Conv'`` or ``'Pool'`` inserts a ``SubSample``
            between stages; any other value disables merging.
        mlp_ratio, qkv_bias, qk_scale, attn_drop_rate: forwarded to each block.
        drop_rate: dropout after the positional embedding and inside blocks.
        last_drop: dropout of the final stage and the length head.
        drop_path_rate: maximum stochastic-depth rate; block rates rise
            linearly from 0 to this value.
        norm_layer: norm class or string expression for blocks / final norm.
        sub_norm: string expression for the ``SubSample`` norm.
        epsilon: epsilon of string-specified norm layers.
        out_channels: channels produced by the last stage / length head.
        out_char_num: output width of the last-stage adaptive pooling.
        block_unit: string expression evaluating to the block class.
        act: string expression evaluating to the block activation.
        last_stage: append pooling, 1x1 conv, hardswish and dropout.
        sub_num: number of stride-2 stages in the patch embedding.
        prenorm: forwarded to each block; when false an extra norm is applied
            after the last stage.
        use_lenhead: additionally return a length-head output.
        **kwargs: ignored.

    Note:
        The list defaults are shared between calls; they are only read here
        and must not be mutated by callers.
    """

    def __init__(
            self,
            img_size=[48, 100],
            in_channels=3,
            embed_dim=[64, 128, 256],
            depth=[3, 6, 3],
            num_heads=[2, 4, 8],
            mixer=['Local'] * 6 + ['Global'] *
            6,  # Local atten, Global atten, Conv
            local_mixer=[[7, 11], [7, 11], [7, 11]],
            patch_merging='Conv',  # Conv, Pool, None
            mlp_ratio=4,
            qkv_bias=True,
            qk_scale=None,
            drop_rate=0.,
            last_drop=0.1,
            attn_drop_rate=0.,
            drop_path_rate=0.1,
            norm_layer='nn.LayerNorm',
            sub_norm='nn.LayerNorm',
            epsilon=1e-6,
            out_channels=192,
            out_char_num=25,
            block_unit='Block',
            act='nn.GELU',
            last_stage=True,
            sub_num=2,
            prenorm=True,
            use_lenhead=False,
            **kwargs):
        super().__init__()
        self.img_size = img_size
        self.embed_dim = embed_dim
        self.out_channels = out_channels
        self.prenorm = prenorm
        # Anything other than 'Conv' / 'Pool' disables patch merging.
        patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            in_channels=in_channels,
            embed_dim=embed_dim[0],
            sub_num=sub_num)
        num_patches = self.patch_embed.num_patches
        self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0]))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # NOTE(security): ``block_unit``, ``act`` and ``norm_layer`` strings
        # are resolved with ``eval``; they must never come from untrusted input.
        Block_unit = eval(block_unit)
        act_layer = eval(act)

        # Per-block stochastic-depth rates, rising linearly over all blocks.
        dpr = np.linspace(0, drop_path_rate, sum(depth))

        def build_stage(stage, hw):
            """Create the blocks of one stage working on token grid ``hw``."""
            # Index of this stage's first block in the flat ``mixer`` / ``dpr`` lists.
            offset = sum(depth[:stage])
            return nn.ModuleList([
                Block_unit(
                    dim=embed_dim[stage],
                    num_heads=num_heads[stage],
                    mixer=mixer[offset + i],
                    HW=hw,
                    local_mixer=local_mixer[stage],
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    act_layer=act_layer,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[offset + i],
                    norm_layer=norm_layer,
                    epsilon=epsilon,
                    prenorm=prenorm) for i in range(depth[stage])
            ])

        self.blocks1 = build_stage(0, self.HW)
        if patch_merging is not None:
            self.sub_sample1 = SubSample(
                embed_dim[0],
                embed_dim[1],
                sub_norm=sub_norm,
                stride=[2, 1],
                types=patch_merging)
            HW = [self.HW[0] // 2, self.HW[1]]
        else:
            HW = self.HW
        self.patch_merging = patch_merging
        self.blocks2 = build_stage(1, HW)
        if patch_merging is not None:
            self.sub_sample2 = SubSample(
                embed_dim[1],
                embed_dim[2],
                sub_norm=sub_norm,
                stride=[2, 1],
                types=patch_merging)
            HW = [self.HW[0] // 4, self.HW[1]]
        else:
            HW = self.HW
        self.blocks3 = build_stage(2, HW)

        self.last_stage = last_stage
        if last_stage:
            self.avg_pool = nn.AdaptiveAvgPool2d((1, out_char_num))
            self.last_conv = nn.Conv2d(
                in_channels=embed_dim[2],
                out_channels=self.out_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False)
            self.hardswish = nn.Hardswish()
            self.dropout = nn.Dropout(p=last_drop)
        if not prenorm:
            # ``nn.LayerNorm`` names its epsilon ``eps``; the Paddle-style
            # ``epsilon=`` keyword used originally raised ``TypeError`` here.
            # Non-string norm layers are called like in ``Block``.
            if isinstance(norm_layer, str):
                self.norm = eval(norm_layer)(embed_dim[-1], eps=epsilon)
            else:
                self.norm = norm_layer(embed_dim[-1])
        self.use_lenhead = use_lenhead
        if use_lenhead:
            self.len_conv = nn.Linear(embed_dim[2], self.out_channels)
            self.hardswish_len = nn.Hardswish()
            self.dropout_len = nn.Dropout(
                p=last_drop)

        trunc_normal_(self.pos_embed, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal linear weights, zero biases, unit LayerNorm weights."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward_features(self, x):
        """Embed the image and run the three stages; returns a token sequence."""
        x = self.patch_embed(x)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks1:
            x = blk(x)
        if self.patch_merging is not None:
            # Tokens -> feature map so the sub-sampler can reduce the height.
            x = self.sub_sample1(
                x.permute([0, 2, 1]).reshape(
                    [-1, self.embed_dim[0], self.HW[0], self.HW[1]]))
        for blk in self.blocks2:
            x = blk(x)
        if self.patch_merging is not None:
            x = self.sub_sample2(
                x.permute([0, 2, 1]).reshape(
                    [-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]))
        for blk in self.blocks3:
            x = blk(x)
        if not self.prenorm:
            x = self.norm(x)
        return x

    def forward(self, x):
        """Return the backbone output, plus the length head when enabled.

        With ``last_stage`` the tokens are reshaped to a feature map and
        passed through adaptive pooling, a 1x1 conv, hardswish and dropout.
        With ``use_lenhead`` the result is the tuple ``(x, len_x)``.
        """
        x = self.forward_features(x)
        if self.use_lenhead:
            len_x = self.len_conv(x.mean(1))
            len_x = self.dropout_len(self.hardswish_len(len_x))
        if self.last_stage:
            # Two sub-samplers have divided the grid height by four.
            if self.patch_merging is not None:
                h = self.HW[0] // 4
            else:
                h = self.HW[0]
            x = self.avg_pool(
                x.permute([0, 2, 1]).reshape(
                    [-1, self.embed_dim[2], h, self.HW[1]]))
            x = self.last_conv(x)
            x = self.hardswish(x)
            x = self.dropout(x)
        if self.use_lenhead:
            return x, len_x
        return x
583
+
584
+
585
if __name__ == "__main__":
    # Manual smoke check: push one random 48x100 RGB image through a
    # default SVTRNet, then print the model and the output size.
    sample = torch.rand(1, 3, 48, 100)
    model = SVTRNet()

    result = model(sample)
    print(model)
    print(result.size())
iopaint/model/anytext/ocr_recog/ppocr_keys_v1.txt ADDED
@@ -0,0 +1,6623 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '
2
+
3
+
4
+
5
+
6
+
7
+
8
+ 贿
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+ 2
26
+ 0
27
+ 8
28
+ -
29
+ 7
30
+
31
+ >
32
+ :
33
+ ]
34
+ ,
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+ 蹿
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+ 1
94
+ 3
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+ !
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+
233
+
234
+
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+
244
+
245
+
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
265
+
266
+
267
+
268
+
269
+
270
+
271
+
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
+
288
+
289
+
290
+
291
+
292
+
293
+
294
+
295
+
296
+
297
+
298
+
299
+
300
+
301
+
302
+
303
+
304
+ 诿
305
+
306
+
307
+
308
+
309
+
310
+
311
+
312
+
313
+
314
+
315
+
316
+ 线
317
+
318
+
319
+
320
+
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+
329
+
330
+
331
+
332
+
333
+
334
+
335
+
336
+
337
+
338
+
339
+
340
+
341
+
342
+
343
+
344
+
345
+
346
+
347
+
348
+
349
+
350
+
351
+
352
+
353
+
354
+
355
+
356
+
357
+
358
+
359
+
360
+
361
+
362
+
363
+
364
+
365
+
366
+
367
+
368
+
369
+
370
+
371
+
372
+
373
+
374
+
375
+
376
+
377
+
378
+
379
+
380
+
381
+
382
+
383
+
384
+
385
+
386
+
387
+
388
+
389
+
390
+
391
+
392
+ 尿
393
+
394
+
395
+
396
+
397
+
398
+
399
+
400
+
401
+ |
402
+ ;
403
+
404
+
405
+
406
+
407
+
408
+
409
+
410
+
411
+
412
+
413
+
414
+
415
+
416
+
417
+
418
+
419
+
420
+
421
+
422
+
423
+
424
+
425
+ H
426
+
427
+
428
+
429
+
430
+
431
+
432
+
433
+
434
+
435
+
436
+
437
+
438
+
439
+
440
+
441
+
442
+
443
+
444
+
445
+
446
+
447
+
448
+
449
+
450
+
451
+
452
+
453
+
454
+
455
+
456
+
457
+
458
+
459
+
460
+
461
+
462
+
463
+
464
+
465
+
466
+ .
467
+
468
+
469
+
470
+
471
+
472
+
473
+
474
+
475
+
476
+
477
+
478
+
479
+
480
+
481
+
482
+
483
+
484
+
485
+
486
+
487
+ /
488
+ *
489
+
490
+ 忿
491
+
492
+
493
+
494
+
495
+
496
+
497
+
498
+
499
+
500
+
501
+
502
+
503
+
504
+ 齿
505
+
506
+
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
521
+
522
+
523
+
524
+
525
+
526
+
527
+
528
+
529
+
530
+
531
+
532
+
533
+
534
+
535
+
536
+
537
+
538
+
539
+
540
+
541
+
542
+
543
+
544
+
545
+
546
+
547
+
548
+
549
+
550
+
551
+
552
+
553
+
554
+
555
+
556
+
557
+
558
+
559
+
560
+
561
+
562
+
563
+
564
+
565
+
566
+
567
+
568
+
569
+
570
+
571
+ 西
572
+
573
+
574
+
575
+
576
+
577
+
578
+
579
+
580
+
581
+
582
+
583
+
584
+
585
+
586
+
587
+
588
+
589
+
590
+
591
+
592
+
593
+
594
+
595
+
596
+
597
+
598
+
599
+
600
+
601
+
602
+
603
+
604
+
605
+
606
+
607
+
608
+
609
+
610
+
611
+
612
+
613
+
614
+
615
+
616
+
617
+
618
+
619
+
620
+
621
+
622
+
623
+
624
+
625
+
626
+
627
+
628
+
629
+
630
+
631
+ 5
632
+ 4
633
+
634
+
635
+
636
+
637
+
638
+
639
+
640
+
641
+
642
+
643
+
644
+
645
+
646
+
647
+
648
+
649
+
650
+
651
+
652
+
653
+
654
+
655
+
656
+ 亿
657
+
658
+
659
+
660
+
661
+
662
+
663
+
664
+
665
+
666
+
667
+
668
+
669
+
670
+
671
+
672
+
673
+
674
+
675
+
676
+
677
+
678
+
679
+
680
+
681
+
682
+
683
+
684
+
685
+
686
+
687
+
688
+
689
+
690
+
691
+
692
+
693
+
694
+
695
+
696
+
697
+
698
+
699
+
700
+
701
+
702
+
703
+
704
+
705
+
706
+
707
+
708
+
709
+
710
+
711
+
712
+
713
+
714
+
715
+
716
+
717
+
718
+
719
+
720
+
721
+ (
722
+
723
+
724
+
725
+
726
+
727
+
728
+
729
+
730
+
731
+
732
+
733
+
734
+
735
+
736
+
737
+
738
+
739
+
740
+
741
+
742
+
743
+
744
+
745
+
746
+
747
+
748
+
749
+
750
+
751
+
752
+
753
+
754
+ 访
755
+
756
+
757
+
758
+
759
+
760
+
761
+
762
+
763
+
764
+
765
+
766
+
767
+
768
+
769
+
770
+
771
+
772
+
773
+
774
+
775
+
776
+
777
+
778
+
779
+
780
+
781
+
782
+
783
+
784
+
785
+
786
+
787
+
788
+
789
+
790
+
791
+
792
+
793
+
794
+
795
+
796
+
797
+
798
+
799
+
800
+
801
+
802
+
803
+
804
+
805
+
806
+
807
+
808
+
809
+
810
+
811
+
812
+
813
+
814
+
815
+
816
+
817
+
818
+
819
+
820
+
821
+
822
+
823
+
824
+
825
+
826
+
827
+
828
+
829
+
830
+
831
+
832
+
833
+
834
+
835
+
836
+
837
+
838
+
839
+
840
+
841
+
842
+
843
+
844
+
845
+
846
+
847
+
848
+
849
+
850
+
851
+
852
+
853
+
854
+
855
+
856
+
857
+
858
+
859
+
860
+
861
+
862
+
863
+
864
+
865
+
866
+
867
+
868
+
869
+
870
+
871
+
872
+
873
+
874
+
875
+
876
+
877
+
878
+
879
+
880
+
881
+
882
+
883
+
884
+
885
+
886
+
887
+
888
+
889
+
890
+
891
+
892
+
893
+
894
+
895
+
896
+
897
+
898
+
899
+
900
+
901
+
902
+
903
+
904
+
905
+
906
+
907
+
908
+
909
+
910
+
911
+
912
+
913
+
914
+
915
+
916
+
917
+
918
+
919
+
920
+
921
+
922
+
923
+
924
+
925
+
926
+
927
+
928
+
929
+
930
+
931
+
932
+
933
+ 6
934
+
935
+
936
+
937
+
938
+
939
+
940
+
941
+
942
+
943
+
944
+
945
+
946
+
947
+
948
+
949
+
950
+
951
+
952
+
953
+
954
+
955
+
956
+
957
+
958
+
959
+
960
+
961
+
962
+
963
+
964
+
965
+ )
966
+
967
+
968
+
969
+
970
+
971
+
972
+
973
+
974
+
975
+
976
+
977
+
978
+
979
+
980
+
981
+
982
+
983
+
984
+
985
+
986
+
987
+
988
+
989
+ 稿
990
+
991
+
992
+
993
+
994
+
995
+
996
+
997
+
998
+
999
+
1000
+
1001
+
1002
+
1003
+
1004
+
1005
+
1006
+
1007
+
1008
+
1009
+
1010
+
1011
+
1012
+
1013
+
1014
+
1015
+
1016
+
1017
+
1018
+
1019
+
1020
+
1021
+
1022
+
1023
+
1024
+
1025
+
1026
+
1027
+
1028
+
1029
+
1030
+
1031
+
1032
+
1033
+ s
1034
+ u
1035
+
1036
+
1037
+
1038
+
1039
+
1040
+
1041
+
1042
+
1043
+
1044
+
1045
+
1046
+
1047
+
1048
+
1049
+
1050
+
1051
+
1052
+
1053
+
1054
+
1055
+
1056
+
1057
+
1058
+
1059
+
1060
+
1061
+
1062
+
1063
+
1064
+
1065
+
1066
+
1067
+
1068
+
1069
+
1070
+
1071
+
1072
+
1073
+
1074
+
1075
+
1076
+
1077
+
1078
+
1079
+
1080
+
1081
+
1082
+
1083
+
1084
+
1085
+
1086
+
1087
+
1088
+
1089
+
1090
+
1091
+
1092
+
1093
+
1094
+
1095
+
1096
+
1097
+
1098
+
1099
+
1100
+
1101
+
1102
+
1103
+
1104
+
1105
+ [
1106
+
1107
+
1108
+
1109
+ 9
1110
+
1111
+
1112
+
1113
+
1114
+
1115
+
1116
+
1117
+
1118
+
1119
+
1120
+
1121
+
1122
+
1123
+
1124
+
1125
+
1126
+
1127
+
1128
+
1129
+
1130
+
1131
+
1132
+
1133
+
1134
+
1135
+
1136
+
1137
+
1138
+
1139
+
1140
+
1141
+
1142
+
1143
+
1144
+
1145
+
1146
+
1147
+
1148
+
1149
+
1150
+
1151
+
1152
+
1153
+
1154
+
1155
+
1156
+
1157
+
1158
+
1159
+
1160
+
1161
+
1162
+
1163
+
1164
+
1165
+
1166
+
1167
+
1168
+
1169
+
1170
+
1171
+
1172
+
1173
+
1174
+
1175
+
1176
+
1177
+
1178
+
1179
+
1180
+
1181
+
1182
+ 岿
1183
+
1184
+
1185
+
1186
+
1187
+
1188
+
1189
+
1190
+
1191
+
1192
+
1193
+
1194
+
1195
+
1196
+
1197
+
1198
+
1199
+
1200
+ 广
1201
+
1202
+
1203
+
1204
+
1205
+
1206
+
1207
+
1208
+
1209
+
1210
+
1211
+
1212
+
1213
+
1214
+
1215
+
1216
+
1217
+ S
1218
+ Y
1219
+ F
1220
+ D
1221
+ A
1222
+
1223
+
1224
+
1225
+
1226
+
1227
+
1228
+
1229
+
1230
+
1231
+
1232
+
1233
+
1234
+
1235
+
1236
+
1237
+
1238
+
1239
+
1240
+
1241
+
1242
+
1243
+
1244
+
1245
+
1246
+
1247
+
1248
+
1249
+
1250
+
1251
+
1252
+
1253
+
1254
+
1255
+
1256
+
1257
+
1258
+
1259
+
1260
+
1261
+
1262
+
1263
+
1264
+
1265
+
1266
+
1267
+
1268
+
1269
+
1270
+
1271
+
1272
+
1273
+
1274
+
1275
+
1276
+
1277
+
1278
+
1279
+
1280
+
1281
+
1282
+
1283
+
1284
+
1285
+
1286
+
1287
+
1288
+
1289
+
1290
+
1291
+
1292
+
1293
+
1294
+
1295
+
1296
+
1297
+
1298
+
1299
+
1300
+
1301
+
1302
+
1303
+
1304
+
1305
+
1306
+
1307
+
1308
+
1309
+
1310
+ P
1311
+
1312
+
1313
+
1314
+
1315
+
1316
+
1317
+
1318
+
1319
+
1320
+
1321
+
1322
+
1323
+
1324
+
1325
+
1326
+
1327
+
1328
+
1329
+
1330
+
1331
+
1332
+
1333
+
1334
+
1335
+
1336
+
1337
+
1338
+
1339
+
1340
+
1341
+
1342
+
1343
+
1344
+
1345
+
1346
+
1347
+
1348
+
1349
+
1350
+
1351
+
1352
+
1353
+
1354
+
1355
+
1356
+
1357
+
1358
+
1359
+
1360
+
1361
+
1362
+
1363
+
1364
+
1365
+
1366
+
1367
+
1368
+
1369
+
1370
+
1371
+
1372
+
1373
+
1374
+
1375
+
1376
+
1377
+
1378
+
1379
+
1380
+
1381
+ T
1382
+
1383
+
1384
+
1385
+
1386
+ 湿
1387
+
1388
+
1389
+
1390
+
1391
+
1392
+
1393
+
1394
+
1395
+
1396
+
1397
+ 窿
1398
+
1399
+
1400
+
1401
+
1402
+
1403
+
1404
+
1405
+
1406
+
1407
+
1408
+
1409
+
1410
+
1411
+
1412
+
1413
+
1414
+
1415
+
1416
+
1417
+
1418
+
1419
+
1420
+
1421
+
1422
+
1423
+
1424
+
1425
+
1426
+
1427
+
1428
+
1429
+
1430
+
1431
+
1432
+
1433
+
1434
+
1435
+
1436
+
1437
+
1438
+
1439
+
1440
+
1441
+
1442
+
1443
+
1444
+
1445
+
1446
+
1447
+
1448
+
1449
+
1450
+
1451
+
1452
+
1453
+
1454
+
1455
+
1456
+
1457
+
1458
+
1459
+
1460
+
1461
+
1462
+
1463
+
1464
+
1465
+
1466
+
1467
+
1468
+
1469
+
1470
+
1471
+
1472
+
1473
+
1474
+
1475
+
1476
+
1477
+
1478
+
1479
+
1480
+
1481
+
1482
+
1483
+
1484
+
1485
+
1486
+
1487
+
1488
+
1489
+
1490
+
1491
+
1492
+
1493
+
1494
+
1495
+
1496
+
1497
+
1498
+
1499
+
1500
+
1501
+
1502
+
1503
+
1504
+
1505
+
1506
+
1507
+
1508
+
1509
+
1510
+
1511
+
1512
+
1513
+
1514
+
1515
+
1516
+
1517
+
1518
+
1519
+
1520
+
1521
+
1522
+
1523
+
1524
+
1525
+
1526
+
1527
+
1528
+
1529
+ @
1530
+
1531
+
1532
+
1533
+
1534
+
1535
+
1536
+
1537
+
1538
+
1539
+
1540
+
1541
+
1542
+
1543
+
1544
+
1545
+
1546
+
1547
+
1548
+
1549
+
1550
+
1551
+
1552
+
1553
+
1554
+
1555
+
1556
+
1557
+
1558
+
1559
+
1560
+
1561
+
1562
+
1563
+ 丿
1564
+
1565
+
1566
+
1567
+
1568
+
1569
+
1570
+
1571
+
1572
+
1573
+
1574
+
1575
+
1576
+
1577
+
1578
+
1579
+
1580
+
1581
+
1582
+
1583
+
1584
+
1585
+
1586
+
1587
+
1588
+
1589
+
1590
+
1591
+
1592
+
1593
+
1594
+
1595
+
1596
+
1597
+
1598
+
1599
+
1600
+
1601
+
1602
+
1603
+
1604
+
1605
+
1606
+
1607
+
1608
+
1609
+
1610
+
1611
+
1612
+
1613
+
1614
+
1615
+
1616
+
1617
+
1618
+
1619
+
1620
+
1621
+
1622
+
1623
+
1624
+
1625
+
1626
+
1627
+
1628
+
1629
+
1630
+
1631
+
1632
+
1633
+
1634
+
1635
+
1636
+
1637
+
1638
+
1639
+
1640
+
1641
+
1642
+
1643
+ 沿
1644
+
1645
+
1646
+
1647
+
1648
+
1649
+
1650
+
1651
+
1652
+
1653
+
1654
+
1655
+
1656
+
1657
+
1658
+
1659
+
1660
+
1661
+
1662
+
1663
+
1664
+
1665
+
1666
+
1667
+
1668
+
1669
+
1670
+
1671
+
1672
+
1673
+
1674
+
1675
+
1676
+
1677
+
1678
+
1679
+
1680
+
1681
+ 使
1682
+
1683
+
1684
+
1685
+
1686
+
1687
+
1688
+
1689
+
1690
+
1691
+
1692
+
1693
+
1694
+
1695
+ 绿
1696
+
1697
+
1698
+
1699
+
1700
+
1701
+
1702
+
1703
+
1704
+
1705
+
1706
+
1707
+
1708
+
1709
+
1710
+
1711
+
1712
+
1713
+
1714
+
1715
+
1716
+
1717
+
1718
+
1719
+
1720
+
1721
+
1722
+
1723
+
1724
+
1725
+
1726
+
1727
+
1728
+
1729
+
1730
+
1731
+
1732
+
1733
+
1734
+
1735
+
1736
+
1737
+
1738
+
1739
+
1740
+
1741
+
1742
+
1743
+
1744
+
1745
+
1746
+
1747
+
1748
+
1749
+
1750
+
1751
+
1752
+
1753
+
1754
+
1755
+
1756
+
1757
+
1758
+
1759
+
1760
+
1761
+
1762
+
1763
+
1764
+
1765
+
1766
+
1767
+
1768
+
1769
+
1770
+
1771
+
1772
+
1773
+
1774
+
1775
+
1776
+
1777
+
1778
+
1779
+
1780
+
1781
+
1782
+
1783
+
1784
+
1785
+
1786
+
1787
+
1788
+
1789
+
1790
+
1791
+
1792
+
1793
+
1794
+
1795
+
1796
+
1797
+
1798
+
1799
+
1800
+
1801
+
1802
+
1803
+
1804
+
1805
+
1806
+
1807
+
1808
+
1809
+
1810
+
1811
+ %
1812
+
1813
+
1814
+
1815
+
1816
+
1817
+
1818
+
1819
+
1820
+
1821
+
1822
+ "
1823
+
1824
+
1825
+
1826
+
1827
+
1828
+
1829
+
1830
+
1831
+
1832
+
1833
+
1834
+
1835
+
1836
+
1837
+ 婿
1838
+
1839
+
1840
+
1841
+
1842
+
1843
+
1844
+
1845
+
1846
+
1847
+
1848
+
1849
+
1850
+
1851
+
1852
+
1853
+
1854
+
1855
+
1856
+
1857
+
1858
+
1859
+
1860
+
1861
+
1862
+
1863
+
1864
+
1865
+
1866
+
1867
+
1868
+
1869
+
1870
+
1871
+
1872
+
1873
+
1874
+
1875
+
1876
+
1877
+
1878
+
1879
+
1880
+
1881
+
1882
+
1883
+
1884
+
1885
+
1886
+
1887
+
1888
+
1889
+
1890
+
1891
+
1892
+
1893
+
1894
+
1895
+
1896
+
1897
+
1898
+
1899
+
1900
+
1901
+
1902
+
1903
+
1904
+
1905
+
1906
+
1907
+
1908
+
1909
+
1910
+
1911
+
1912
+
1913
+
1914
+
1915
+
1916
+
1917
+
1918
+
1919
+
1920
+
1921
+
1922
+
1923
+
1924
+
1925
+
1926
+
1927
+
1928
+
1929
+
1930
+
1931
+
1932
+
1933
+
1934
+
1935
+
1936
+
1937
+
1938
+
1939
+
1940
+
1941
+
1942
+
1943
+
1944
+
1945
+
1946
+
1947
+
1948
+
1949
+
1950
+
1951
+
1952
+
1953
+
1954
+
1955
+
1956
+
1957
+
1958
+ r
1959
+
1960
+
1961
+
1962
+
1963
+
1964
+
1965
+
1966
+
1967
+
1968
+
1969
+
1970
+
1971
+
1972
+
1973
+
1974
+
1975
+
1976
+
1977
+
1978
+
1979
+
1980
+
1981
+
1982
+
1983
+
1984
+
1985
+
1986
+
1987
+
1988
+
1989
+ =
1990
+
1991
+
1992
+
1993
+
1994
+
1995
+
1996
+
1997
+
1998
+
1999
+
2000
+ 饿
2001
+
2002
+
2003
+
2004
+
2005
+
2006
+
2007
+
2008
+
2009
+
2010
+
2011
+
2012
+
2013
+
2014
+
2015
+
2016
+
2017
+
2018
+
2019
+
2020
+
2021
+
2022
+
2023
+
2024
+
2025
+
2026
+
2027
+
2028
+
2029
+
2030
+
2031
+
2032
+
2033
+
2034
+
2035
+
2036
+
2037
+
2038
+
2039
+
2040
+
2041
+
2042
+
2043
+
2044
+
2045
+
2046
+
2047
+
2048
+
2049
+
2050
+
2051
+
2052
+
2053
+
2054
+
2055
+
2056
+
2057
+
2058
+
2059
+
2060
+
2061
+
2062
+
2063
+
2064
+
2065
+
2066
+
2067
+
2068
+
2069
+
2070
+
2071
+
2072
+
2073
+
2074
+
2075
+
2076
+
2077
+
2078
+
2079
+
2080
+
2081
+
2082
+
2083
+
2084
+
2085
+
2086
+
2087
+
2088
+
2089
+
2090
+
2091
+
2092
+
2093
+
2094
+
2095
+
2096
+
2097
+
2098
+
2099
+
2100
+
2101
+
2102
+
2103
+
2104
+
2105
+
2106
+
2107
+
2108
+
2109
+
2110
+
2111
+
2112
+
2113
+
2114
+
2115
+ ˇ
2116
+
2117
+
2118
+
2119
+
2120
+
2121
+
2122
+
2123
+
2124
+
2125
+
2126
+
2127
+
2128
+
2129
+
2130
+
2131
+
2132
+
2133
+
2134
+
2135
+
2136
+
2137
+
2138
+
2139
+
2140
+
2141
+
2142
+
2143
+
2144
+
2145
+
2146
+
2147
+
2148
+
2149
+
2150
+
2151
+
2152
+
2153
+
2154
+
2155
+
2156
+ q
2157
+
2158
+
2159
+
2160
+
2161
+
2162
+
2163
+
2164
+
2165
+
2166
+
2167
+
2168
+
2169
+
2170
+
2171
+
2172
+
2173
+
2174
+
2175
+
2176
+
2177
+
2178
+
2179
+
2180
+
2181
+
2182
+
2183
+
2184
+
2185
+
2186
+
2187
+
2188
+
2189
+
2190
+
2191
+
2192
+
2193
+
2194
+
2195
+
2196
+
2197
+
2198
+
2199
+
2200
+
2201
+
2202
+
2203
+
2204
+
2205
+
2206
+
2207
+
2208
+
2209
+
2210
+
2211
+
2212
+
2213
+
2214
+
2215
+
2216
+
2217
+
2218
+
2219
+
2220
+
2221
+
2222
+
2223
+
2224
+
2225
+
2226
+
2227
+
2228
+
2229
+
2230
+
2231
+
2232
+
2233
+
2234
+
2235
+
2236
+
2237
+
2238
+
2239
+
2240
+
2241
+
2242
+
2243
+
2244
+
2245
+
2246
+
2247
+
2248
+
2249
+
2250
+
2251
+
2252
+
2253
+
2254
+
2255
+
2256
+
2257
+
2258
+
2259
+
2260
+
2261
+
2262
+
2263
+
2264
+
2265
+
2266
+
2267
+
2268
+
2269
+ ÷
2270
+
2271
+
2272
+
2273
+
2274
+
2275
+
2276
+
2277
+
2278
+
2279
+
2280
+
2281
+
2282
+
2283
+
2284
+
2285
+
2286
+
2287
+
2288
+
2289
+
2290
+
2291
+
2292
+
2293
+
2294
+
2295
+
2296
+
2297
+
2298
+
2299
+
2300
+
2301
+
2302
+
2303
+
2304
+
2305
+
2306
+
2307
+
2308
+
2309
+
2310
+
2311
+
2312
+
2313
+
2314
+
2315
+
2316
+
2317
+
2318
+
2319
+
2320
+
2321
+
2322
+
2323
+
2324
+
2325
+
2326
+
2327
+
2328
+
2329
+
2330
+
2331
+
2332
+
2333
+
2334
+
2335
+
2336
+
2337
+
2338
+
2339
+
2340
+
2341
+
2342
+
2343
+
2344
+
2345
+
2346
+
2347
+
2348
+
2349
+
2350
+
2351
+
2352
+
2353
+
2354
+
2355
+
2356
+
2357
+
2358
+
2359
+
2360
+
2361
+
2362
+
2363
+
2364
+
2365
+
2366
+
2367
+
2368
+
2369
+
2370
+
2371
+
2372
+
2373
+
2374
+
2375
+
2376
+
2377
+
2378
+
2379
+
2380
+
2381
+ 椿
2382
+
2383
+
2384
+
2385
+ 寿
2386
+
2387
+
2388
+
2389
+
2390
+
2391
+
2392
+
2393
+
2394
+
2395
+
2396
+
2397
+
2398
+
2399
+
2400
+
2401
+
2402
+
2403
+
2404
+
2405
+
2406
+
2407
+
2408
+
2409
+
2410
+
2411
+
2412
+
2413
+
2414
+
2415
+
2416
+
2417
+
2418
+
2419
+
2420
+
2421
+
2422
+
2423
+
2424
+
2425
+
2426
+
2427
+
2428
+
2429
+
2430
+
2431
+
2432
+
2433
+
2434
+
2435
+
2436
+
2437
+
2438
+
2439
+
2440
+
2441
+
2442
+
2443
+
2444
+
2445
+
2446
+
2447
+
2448
+
2449
+
2450
+
2451
+
2452
+
2453
+
2454
+
2455
+
2456
+ ?
2457
+
2458
+
2459
+
2460
+
2461
+
2462
+
2463
+
2464
+
2465
+
2466
+
2467
+
2468
+
2469
+
2470
+
2471
+
2472
+
2473
+
2474
+
2475
+
2476
+
2477
+
2478
+
2479
+
2480
+
2481
+
2482
+
2483
+
2484
+
2485
+
2486
+
2487
+
2488
+
2489
+
2490
+
2491
+
2492
+
2493
+
2494
+
2495
+
2496
+
2497
+
2498
+
2499
+
2500
+
2501
+
2502
+
2503
+
2504
+
2505
+
2506
+
2507
+
2508
+
2509
+
2510
+
2511
+
2512
+
2513
+
2514
+
2515
+
2516
+
2517
+
2518
+
2519
+
2520
+
2521
+
2522
+
2523
+
2524
+
2525
+
2526
+
2527
+
2528
+
2529
+
2530
+
2531
+
2532
+
2533
+
2534
+
2535
+
2536
+
2537
+
2538
+
2539
+
2540
+
2541
+
2542
+
2543
+
2544
+
2545
+
2546
+
2547
+
2548
+
2549
+
2550
+
2551
+
2552
+
2553
+
2554
+
2555
+
2556
+
2557
+
2558
+
2559
+
2560
+
2561
+
2562
+
2563
+
2564
+
2565
+
2566
+
2567
+
2568
+
2569
+
2570
+
2571
+
2572
+
2573
+
2574
+
2575
+
2576
+
2577
+
2578
+
2579
+
2580
+
2581
+
2582
+
2583
+
2584
+
2585
+
2586
+
2587
+
2588
+
2589
+
2590
+
2591
+
2592
+
2593
+
2594
+
2595
+
2596
+
2597
+
2598
+ 便
2599
+
2600
+
2601
+
2602
+
2603
+
2604
+
2605
+
2606
+
2607
+
2608
+
2609
+
2610
+
2611
+
2612
+
2613
+
2614
+
2615
+
2616
+
2617
+
2618
+
2619
+
2620
+
2621
+
2622
+
2623
+
2624
+
2625
+
2626
+
2627
+
2628
+
2629
+
2630
+
2631
+
2632
+
2633
+
2634
+
2635
+
2636
+
2637
+
2638
+
2639
+
2640
+
2641
+
2642
+
2643
+
2644
+
2645
+
2646
+
2647
+
2648
+
2649
+
2650
+
2651
+
2652
+
2653
+
2654
+
2655
+
2656
+
2657
+
2658
+
2659
+
2660
+
2661
+
2662
+
2663
+
2664
+
2665
+
2666
+
2667
+ 殿
2668
+
2669
+
2670
+
2671
+
2672
+
2673
+
2674
+
2675
+
2676
+
2677
+
2678
+
2679
+
2680
+
2681
+
2682
+
2683
+
2684
+
2685
+
2686
+
2687
+
2688
+
2689
+
2690
+
2691
+
2692
+
2693
+
2694
+
2695
+
2696
+
2697
+ J
2698
+
2699
+
2700
+
2701
+
2702
+
2703
+
2704
+
2705
+
2706
+
2707
+
2708
+
2709
+
2710
+ l
2711
+
2712
+
2713
+
2714
+
2715
+
2716
+
2717
+
2718
+
2719
+
2720
+
2721
+
2722
+
2723
+
2724
+
2725
+
2726
+
2727
+
2728
+
2729
+
2730
+
2731
+
2732
+
2733
+
2734
+
2735
+
2736
+
2737
+
2738
+
2739
+
2740
+
2741
+
2742
+
2743
+
2744
+
2745
+
2746
+
2747
+
2748
+
2749
+
2750
+
2751
+
2752
+
2753
+
2754
+
2755
+
2756
+
2757
+
2758
+
2759
+
2760
+
2761
+
2762
+
2763
+
2764
+
2765
+
2766
+
2767
+
2768
+
2769
+
2770
+
2771
+
2772
+
2773
+
2774
+
2775
+
2776
+
2777
+
2778
+
2779
+
2780
+
2781
+
2782
+
2783
+
2784
+
2785
+
2786
+
2787
+
2788
+
2789
+
2790
+
2791
+
2792
+
2793
+
2794
+
2795
+
2796
+
2797
+
2798
+
2799
+
2800
+
2801
+
2802
+
2803
+
2804
+
2805
+
2806
+
2807
+
2808
+
2809
+
2810
+
2811
+
2812
+
2813
+
2814
+
2815
+
2816
+
2817
+
2818
+
2819
+
2820
+
2821
+
2822
+
2823
+
2824
+
2825
+
2826
+
2827
+
2828
+
2829
+
2830
+
2831
+
2832
+
2833
+
2834
+
2835
+
2836
+
2837
+
2838
+
2839
+
2840
+
2841
+
2842
+
2843
+
2844
+
2845
+
2846
+
2847
+
2848
+
2849
+
2850
+
2851
+
2852
+
2853
+
2854
+ &
2855
+
2856
+
2857
+
2858
+
2859
+
2860
+
2861
+
2862
+
2863
+
2864
+
2865
+
2866
+
2867
+
2868
+
2869
+
2870
+
2871
+
2872
+
2873
+
2874
+
2875
+
2876
+
2877
+
2878
+
2879
+
2880
+
2881
+
2882
+
2883
+
2884
+
2885
+
2886
+
2887
+
2888
+
2889
+
2890
+
2891
+
2892
+
2893
+
2894
+
2895
+
2896
+
2897
+
2898
+
2899
+
2900
+
2901
+
2902
+
2903
+
2904
+
2905
+
2906
+
2907
+
2908
+
2909
+
2910
+
2911
+
2912
+
2913
+
2914
+
2915
+
2916
+
2917
+
2918
+
2919
+
2920
+
2921
+
2922
+
2923
+
2924
+
2925
+
2926
+
2927
+
2928
+
2929
+
2930
+
2931
+
2932
+
2933
+
2934
+
2935
+
2936
+
2937
+
2938
+
2939
+
2940
+
2941
+
2942
+
2943
+ 驿
2944
+
2945
+
2946
+
2947
+
2948
+
2949
+
2950
+
2951
+
2952
+
2953
+
2954
+
2955
+
2956
+
2957
+
2958
+
2959
+
2960
+
2961
+
2962
+
2963
+
2964
+
2965
+
2966
+
2967
+
2968
+
2969
+
2970
+
2971
+
2972
+
2973
+
2974
+
2975
+
2976
+
2977
+
2978
+
2979
+
2980
+
2981
+
2982
+
2983
+
2984
+
2985
+
2986
+
2987
+
2988
+
2989
+
2990
+
2991
+
2992
+
2993
+ x
2994
+
2995
+
2996
+
2997
+
2998
+
2999
+
3000
+
3001
+
3002
+
3003
+
3004
+
3005
+
3006
+
3007
+
3008
+
3009
+
3010
+
3011
+
3012
+
3013
+
3014
+
3015
+
3016
+
3017
+
3018
+
3019
+
3020
+
3021
+
3022
+ 耀
3023
+
3024
+
3025
+
3026
+
3027
+
3028
+
3029
+
3030
+
3031
+
3032
+
3033
+
3034
+
3035
+
3036
+
3037
+
3038
+
3039
+
3040
+
3041
+
3042
+
3043
+
3044
+
3045
+
3046
+
3047
+
3048
+
3049
+
3050
+
3051
+
3052
+
3053
+
3054
+
3055
+
3056
+
3057
+
3058
+
3059
+
3060
+
3061
+
3062
+
3063
+
3064
+
3065
+
3066
+
3067
+
3068
+
3069
+
3070
+
3071
+
3072
+ 仿
3073
+
3074
+
3075
+
3076
+
3077
+
3078
+
3079
+
3080
+
3081
+
3082
+
3083
+
3084
+
3085
+
3086
+
3087
+
3088
+
3089
+
3090
+
3091
+
3092
+
3093
+
3094
+
3095
+
3096
+
3097
+
3098
+
3099
+
3100
+
3101
+
3102
+
3103
+
3104
+
3105
+
3106
+
3107
+
3108
+
3109
+
3110
+
3111
+
3112
+
3113
+
3114
+
3115
+
3116
+
3117
+
3118
+
3119
+
3120
+
3121
+
3122
+
3123
+ 鸿
3124
+
3125
+
3126
+
3127
+
3128
+
3129
+
3130
+
3131
+
3132
+
3133
+
3134
+
3135
+
3136
+
3137
+
3138
+
3139
+
3140
+
3141
+
3142
+
3143
+
3144
+
3145
+
3146
+
3147
+
3148
+
3149
+
3150
+
3151
+
3152
+
3153
+
3154
+
3155
+
3156
+
3157
+
3158
+
3159
+
3160
+
3161
+
3162
+
3163
+
3164
+
3165
+
3166
+
3167
+
3168
+
3169
+
3170
+
3171
+
3172
+
3173
+
3174
+
3175
+
3176
+
3177
+
3178
+
3179
+
3180
+
3181
+
3182
+
3183
+
3184
+
3185
+
3186
+
3187
+
3188
+
3189
+
3190
+
3191
+
3192
+
3193
+
3194
+
3195
+
3196
+
3197
+
3198
+
3199
+
3200
+
3201
+
3202
+
3203
+
3204
+
3205
+
3206
+
3207
+
3208
+
3209
+
3210
+
3211
+
3212
+
3213
+
3214
+
3215
+
3216
+
3217
+
3218
+
3219
+
3220
+
3221
+
3222
+
3223
+
3224
+
3225
+
3226
+
3227
+
3228
+
3229
+
3230
+
3231
+
3232
+
3233
+
3234
+
3235
+
3236
+
3237
+
3238
+
3239
+ 廿
3240
+
3241
+
3242
+
3243
+
3244
+
3245
+
3246
+
3247
+
3248
+
3249
+
3250
+
3251
+
3252
+
3253
+
3254
+
3255
+
3256
+
3257
+
3258
+
3259
+
3260
+
3261
+
3262
+
3263
+
3264
+
3265
+
3266
+
3267
+
3268
+
3269
+
3270
+
3271
+
3272
+
3273
+
3274
+
3275
+
3276
+
3277
+
3278
+
3279
+
3280
+
3281
+
3282
+
3283
+
3284
+
3285
+
3286
+
3287
+
3288
+
3289
+
3290
+
3291
+
3292
+
3293
+
3294
+
3295
+
3296
+
3297
+
3298
+
3299
+
3300
+
3301
+
3302
+
3303
+
3304
+
3305
+
3306
+
3307
+
3308
+
3309
+
3310
+
3311
+
3312
+
3313
+
3314
+
3315
+
3316
+ z
3317
+
3318
+
3319
+ ±
3320
+
3321
+
3322
+
3323
+
3324
+
3325
+
3326
+
3327
+
3328
+
3329
+
3330
+
3331
+
3332
+ e
3333
+ t
3334
+
3335
+
3336
+
3337
+
3338
+
3339
+
3340
+
3341
+
3342
+
3343
+
3344
+
3345
+
3346
+
3347
+
3348
+
3349
+
3350
+
3351
+
3352
+
3353
+
3354
+
3355
+
3356
+
3357
+
3358
+
3359
+
3360
+
3361
+
3362
+
3363
+
3364
+
3365
+
3366
+
3367
+
3368
+
3369
+
3370
+
3371
+
3372
+
3373
+
3374
+
3375
+
3376
+
3377
+
3378
+
3379
+
3380
+ §
3381
+
3382
+
3383
+
3384
+
3385
+
3386
+
3387
+
3388
+
3389
+
3390
+
3391
+
3392
+
3393
+
3394
+
3395
+
3396
+
3397
+
3398
+
3399
+
3400
+ 姿
3401
+
3402
+
3403
+
3404
+
3405
+
3406
+
3407
+
3408
+
3409
+
3410
+
3411
+
3412
+
3413
+
3414
+
3415
+
3416
+
3417
+
3418
+
3419
+
3420
+
3421
+
3422
+
3423
+
3424
+
3425
+
3426
+
3427
+
3428
+
3429
+
3430
+
3431
+
3432
+
3433
+
3434
+
3435
+
3436
+
3437
+
3438
+
3439
+
3440
+
3441
+
3442
+
3443
+
3444
+
3445
+
3446
+
3447
+
3448
+
3449
+
3450
+
3451
+
3452
+
3453
+
3454
+
3455
+
3456
+
3457
+
3458
+
3459
+
3460
+
3461
+
3462
+
3463
+ b
3464
+
3465
+
3466
+
3467
+
3468
+
3469
+
3470
+
3471
+
3472
+
3473
+
3474
+
3475
+
3476
+
3477
+
3478
+
3479
+
3480
+
3481
+
3482
+
3483
+
3484
+
3485
+
3486
+
3487
+
3488
+
3489
+
3490
+ <
3491
+
3492
+
3493
+
3494
+
3495
+
3496
+
3497
+
3498
+
3499
+
3500
+
3501
+
3502
+
3503
+
3504
+
3505
+ 退
3506
+ L
3507
+
3508
+
3509
+
3510
+
3511
+
3512
+
3513
+
3514
+
3515
+
3516
+
3517
+ 鹿
3518
+
3519
+
3520
+
3521
+
3522
+
3523
+
3524
+
3525
+
3526
+
3527
+
3528
+
3529
+
3530
+
3531
+
3532
+
3533
+
3534
+
3535
+
3536
+
3537
+ w
3538
+ i
3539
+ h
3540
+
3541
+
3542
+
3543
+
3544
+
3545
+
3546
+
3547
+
3548
+
3549
+
3550
+
3551
+
3552
+
3553
+
3554
+
3555
+
3556
+
3557
+
3558
+
3559
+
3560
+
3561
+
3562
+
3563
+
3564
+
3565
+
3566
+
3567
+
3568
+
3569
+
3570
+
3571
+
3572
+
3573
+ +
3574
+
3575
+
3576
+
3577
+
3578
+
3579
+
3580
+
3581
+
3582
+
3583
+
3584
+
3585
+
3586
+
3587
+ I
3588
+ B
3589
+ N
3590
+
3591
+
3592
+
3593
+
3594
+
3595
+
3596
+
3597
+
3598
+
3599
+
3600
+
3601
+
3602
+
3603
+
3604
+
3605
+
3606
+
3607
+
3608
+
3609
+
3610
+
3611
+
3612
+
3613
+
3614
+
3615
+
3616
+
3617
+
3618
+
3619
+
3620
+
3621
+
3622
+
3623
+
3624
+
3625
+
3626
+ ^
3627
+ _
3628
+
3629
+
3630
+
3631
+
3632
+
3633
+
3634
+
3635
+
3636
+
3637
+
3638
+
3639
+ M
3640
+
3641
+
3642
+
3643
+
3644
+
3645
+
3646
+
3647
+
3648
+
3649
+
3650
+
3651
+
3652
+
3653
+
3654
+
3655
+
3656
+
3657
+
3658
+
3659
+
3660
+
3661
+
3662
+
3663
+
3664
+
3665
+
3666
+
3667
+
3668
+
3669
+
3670
+
3671
+
3672
+
3673
+ 鱿
3674
+
3675
+
3676
+
3677
+
3678
+
3679
+
3680
+
3681
+
3682
+
3683
+
3684
+
3685
+
3686
+
3687
+
3688
+
3689
+
3690
+
3691
+
3692
+
3693
+
3694
+
3695
+
3696
+
3697
+
3698
+
3699
+
3700
+
3701
+
3702
+
3703
+
3704
+
3705
+
3706
+
3707
+
3708
+
3709
+
3710
+
3711
+
3712
+
3713
+
3714
+
3715
+
3716
+
3717
+
3718
+
3719
+
3720
+
3721
+
3722
+
3723
+
3724
+
3725
+
3726
+
3727
+
3728
+
3729
+
3730
+
3731
+
3732
+
3733
+
3734
+
3735
+
3736
+
3737
+
3738
+
3739
+
3740
+
3741
+
3742
+
3743
+
3744
+
3745
+
3746
+
3747
+
3748
+
3749
+
3750
+
3751
+
3752
+
3753
+
3754
+
3755
+
3756
+
3757
+
3758
+
3759
+
3760
+
3761
+
3762
+
3763
+
3764
+
3765
+
3766
+
3767
+
3768
+
3769
+
3770
+
3771
+
3772
+
3773
+
3774
+
3775
+
3776
+
3777
+
3778
+
3779
+
3780
+
3781
+
3782
+
3783
+
3784
+
3785
+
3786
+
3787
+
3788
+
3789
+
3790
+
3791
+
3792
+
3793
+
3794
+
3795
+
3796
+
3797
+
3798
+
3799
+
3800
+
3801
+
3802
+
3803
+
3804
+
3805
+
3806
+
3807
+
3808
+
3809
+
3810
+
3811
+
3812
+
3813
+
3814
+
3815
+
3816
+
3817
+
3818
+
3819
+
3820
+
3821
+
3822
+
3823
+
3824
+
3825
+
3826
+
3827
+
3828
+
3829
+
3830
+
3831
+
3832
+
3833
+
3834
+
3835
+
3836
+
3837
+
3838
+
3839
+
3840
+
3841
+
3842
+
3843
+
3844
+
3845
+
3846
+
3847
+
3848
+
3849
+
3850
+
3851
+
3852
+
3853
+
3854
+
3855
+
3856
+
3857
+
3858
+
3859
+
3860
+
3861
+
3862
+
3863
+
3864
+
3865
+
3866
+
3867
+
3868
+
3869
+
3870
+
3871
+
3872
+
3873
+
3874
+
3875
+
3876
+
3877
+ 怀
3878
+
3879
+
3880
+
3881
+
3882
+
3883
+
3884
+
3885
+
3886
+
3887
+
3888
+
3889
+
3890
+
3891
+
3892
+
3893
+
3894
+
3895
+
3896
+
3897
+
3898
+
3899
+
3900
+
3901
+
3902
+
3903
+
3904
+
3905
+
3906
+
3907
+
3908
+
3909
+
3910
+
3911
+
3912
+
3913
+
3914
+
3915
+
3916
+
3917
+
3918
+
3919
+
3920
+
3921
+
3922
+
3923
+
3924
+
3925
+
3926
+
3927
+
3928
+
3929
+
3930
+
3931
+
3932
+
3933
+
3934
+
3935
+
3936
+
3937
+
3938
+
3939
+
3940
+
3941
+
3942
+
3943
+
3944
+
3945
+
3946
+
3947
+
3948
+
3949
+
3950
+
3951
+
3952
+
3953
+
3954
+
3955
+
3956
+
3957
+
3958
+
3959
+
3960
+
3961
+
3962
+
3963
+
3964
+
3965
+
3966
+
3967
+
3968
+
3969
+
3970
+
3971
+
3972
+
3973
+
3974
+
3975
+
3976
+
3977
+
3978
+
3979
+
3980
+
3981
+
3982
+
3983
+
3984
+
3985
+
3986
+
3987
+
3988
+
3989
+
3990
+
3991
+
3992
+
3993
+
3994
+
3995
+
3996
+
3997
+
3998
+
3999
+
4000
+
4001
+
4002
+
4003
+
4004
+
4005
+
4006
+
4007
+
4008
+
4009
+
4010
+
4011
+
4012
+
4013
+
4014
+
4015
+
4016
+
4017
+
4018
+
4019
+
4020
+
4021
+
4022
+
4023
+
4024
+
4025
+
4026
+
4027
+
4028
+
4029
+
4030
+
4031
+
4032
+
4033
+
4034
+
4035
+
4036
+
4037
+
4038
+
4039
+
4040
+
4041
+
4042
+
4043
+
4044
+
4045
+
4046
+
4047
+
4048
+
4049
+
4050
+
4051
+
4052
+
4053
+
4054
+
4055
+
4056
+
4057
+
4058
+
4059
+
4060
+
4061
+
4062
+
4063
+
4064
+
4065
+
4066
+
4067
+
4068
+
4069
+
4070
+
4071
+
4072
+
4073
+
4074
+
4075
+
4076
+
4077
+
4078
+ }
4079
+
4080
+
4081
+
4082
+
4083
+
4084
+
4085
+
4086
+
4087
+
4088
+
4089
+
4090
+
4091
+
4092
+
4093
+
4094
+
4095
+
4096
+
4097
+
4098
+
4099
+
4100
+
4101
+
4102
+
4103
+
4104
+
4105
+
4106
+
4107
+
4108
+
4109
+
4110
+
4111
+
4112
+
4113
+
4114
+
4115
+
4116
+
4117
+
4118
+
4119
+
4120
+
4121
+
4122
+
4123
+
4124
+
4125
+
4126
+
4127
+
4128
+ ~
4129
+
4130
+
4131
+
4132
+
4133
+
4134
+
4135
+
4136
+ Z
4137
+
4138
+
4139
+
4140
+
4141
+
4142
+
4143
+
4144
+
4145
+
4146
+
4147
+
4148
+
4149
+
4150
+
4151
+
4152
+
4153
+
4154
+
4155
+
4156
+
4157
+
4158
+
4159
+
4160
+
4161
+
4162
+
4163
+
4164
+
4165
+
4166
+
4167
+
4168
+
4169
+
4170
+
4171
+
4172
+
4173
+
4174
+
4175
+
4176
+
4177
+
4178
+
4179
+
4180
+
4181
+
4182
+
4183
+
4184
+
4185
+
4186
+
4187
+
4188
+
4189
+
4190
+
4191
+
4192
+
4193
+
4194
+
4195
+
4196
+
4197
+
4198
+
4199
+
4200
+
4201
+
4202
+
4203
+
4204
+
4205
+
4206
+
4207
+
4208
+
4209
+
4210
+
4211
+
4212
+
4213
+
4214
+
4215
+
4216
+
4217
+
4218
+
4219
+
4220
+
4221
+
4222
+
4223
+
4224
+
4225
+
4226
+
4227
+
4228
+
4229
+
4230
+
4231
+
4232
+
4233
+
4234
+
4235
+
4236
+ 槿
4237
+
4238
+
4239
+
4240
+
4241
+
4242
+
4243
+
4244
+ C
4245
+ o
4246
+
4247
+
4248
+
4249
+
4250
+
4251
+
4252
+
4253
+
4254
+
4255
+
4256
+
4257
+
4258
+
4259
+
4260
+
4261
+
4262
+
4263
+
4264
+
4265
+
4266
+
4267
+
4268
+
4269
+
4270
+
4271
+
4272
+
4273
+
4274
+
4275
+
4276
+
4277
+
4278
+
4279
+
4280
+
4281
+
4282
+
4283
+
4284
+
4285
+
4286
+
4287
+
4288
+
4289
+
4290
+
4291
+
4292
+
4293
+
4294
+
4295
+
4296
+
4297
+
4298
+
4299
+
4300
+
4301
+
4302
+
4303
+
4304
+
4305
+
4306
+
4307
+
4308
+
4309
+
4310
+
4311
+
4312
+
4313
+
4314
+
4315
+
4316
+
4317
+
4318
+
4319
+
4320
+
4321
+
4322
+
4323
+
4324
+
4325
+
4326
+
4327
+
4328
+
4329
+
4330
+
4331
+
4332
+
4333
+
4334
+
4335
+
4336
+
4337
+
4338
+
4339
+
4340
+
4341
+
4342
+
4343
+
4344
+
4345
+
4346
+
4347
+
4348
+
4349
+
4350
+
4351
+
4352
+
4353
+
4354
+
4355
+
4356
+
4357
+
4358
+
4359
+
4360
+
4361
+
4362
+
4363
+
4364
+
4365
+
4366
+
4367
+
4368
+
4369
+
4370
+
4371
+
4372
+
4373
+
4374
+
4375
+
4376
+
4377
+
4378
+
4379
+
4380
+
4381
+ E
4382
+
4383
+
4384
+
4385
+
4386
+
4387
+
4388
+
4389
+ f
4390
+
4391
+
4392
+
4393
+
4394
+
4395
+
4396
+
4397
+
4398
+
4399
+
4400
+
4401
+
4402
+
4403
+
4404
+
4405
+
4406
+
4407
+
4408
+
4409
+
4410
+
4411
+
4412
+
4413
+
4414
+
4415
+
4416
+
4417
+
4418
+
4419
+
4420
+
4421
+
4422
+
4423
+
4424
+
4425
+
4426
+
4427
+
4428
+ \
4429
+
4430
+
4431
+
4432
+
4433
+
4434
+
4435
+
4436
+
4437
+
4438
+
4439
+
4440
+
4441
+
4442
+
4443
+
4444
+
4445
+
4446
+
4447
+
4448
+
4449
+
4450
+
4451
+
4452
+
4453
+
4454
+
4455
+
4456
+
4457
+
4458
+
4459
+
4460
+
4461
+
4462
+
4463
+
4464
+
4465
+
4466
+
4467
+
4468
+
4469
+
4470
+
4471
+
4472
+
4473
+ 屿
4474
+
4475
+
4476
+
4477
+
4478
+
4479
+
4480
+
4481
+
4482
+
4483
+
4484
+
4485
+
4486
+
4487
+
4488
+
4489
+
4490
+
4491
+
4492
+
4493
+
4494
+
4495
+
4496
+
4497
+ U
4498
+
4499
+
4500
+
4501
+
4502
+
4503
+
4504
+
4505
+
4506
+
4507
+
4508
+
4509
+
4510
+
4511
+
4512
+
4513
+
4514
+
4515
+
4516
+
4517
+
4518
+
4519
+
4520
+
4521
+
4522
+
4523
+
4524
+
4525
+
4526
+
4527
+
4528
+
4529
+
4530
+
4531
+
4532
+
4533
+
4534
+
4535
+
4536
+
4537
+
4538
+
4539
+
4540
+
4541
+
4542
+
4543
+
4544
+ a
4545
+ p
4546
+ y
4547
+ n
4548
+ g
4549
+
4550
+
4551
+
4552
+
4553
+
4554
+
4555
+
4556
+
4557
+
4558
+
4559
+
4560
+
4561
+
4562
+
4563
+
4564
+
4565
+
4566
+
4567
+
4568
+
4569
+
4570
+
4571
+
4572
+
4573
+
4574
+
4575
+
4576
+
4577
+
4578
+
4579
+
4580
+
4581
+
4582
+
4583
+
4584
+
4585
+
4586
+
4587
+
4588
+
4589
+
4590
+
4591
+
4592
+
4593
+
4594
+
4595
+
4596
+
4597
+
4598
+
4599
+
4600
+
4601
+
4602
+
4603
+
4604
+
4605
+
4606
+
4607
+
4608
+
4609
+
4610
+
4611
+
4612
+
4613
+
4614
+
4615
+
4616
+
4617
+
4618
+
4619
+
4620
+
4621
+
4622
+
4623
+
4624
+
4625
+
4626
+
4627
+
4628
+
4629
+
4630
+
4631
+
4632
+
4633
+
4634
+
4635
+
4636
+
4637
+
4638
+
4639
+
4640
+
4641
+
4642
+
4643
+
4644
+
4645
+
4646
+
4647
+
4648
+
4649
+
4650
+
4651
+
4652
+
4653
+
4654
+
4655
+
4656
+
4657
+
4658
+
4659
+
4660
+
4661
+
4662
+
4663
+
4664
+
4665
+
4666
+
4667
+
4668
+
4669
+
4670
+
4671
+
4672
+
4673
+
4674
+
4675
+
4676
+
4677
+
4678
+
4679
+
4680
+
4681
+
4682
+
4683
+
4684
+
4685
+
4686
+
4687
+
4688
+
4689
+
4690
+
4691
+
4692
+
4693
+
4694
+
4695
+
4696
+
4697
+
4698
+
4699
+
4700
+
4701
+
4702
+
4703
+
4704
+
4705
+
4706
+
4707
+ 竿
4708
+
4709
+
4710
+
4711
+
4712
+
4713
+
4714
+
4715
+
4716
+
4717
+
4718
+
4719
+
4720
+
4721
+
4722
+
4723
+
4724
+
4725
+
4726
+
4727
+
4728
+
4729
+
4730
+
4731
+ Q
4732
+
4733
+
4734
+
4735
+
4736
+
4737
+
4738
+
4739
+ 羿
4740
+
4741
+ O
4742
+
4743
+
4744
+
4745
+
4746
+
4747
+
4748
+
4749
+
4750
+
4751
+
4752
+
4753
+
4754
+
4755
+
4756
+
4757
+
4758
+
4759
+ 宿
4760
+
4761
+
4762
+
4763
+
4764
+
4765
+
4766
+
4767
+
4768
+
4769
+
4770
+
4771
+
4772
+
4773
+
4774
+
4775
+
4776
+
4777
+
4778
+
4779
+
4780
+
4781
+
4782
+
4783
+
4784
+
4785
+
4786
+
4787
+
4788
+
4789
+
4790
+
4791
+
4792
+
4793
+
4794
+
4795
+
4796
+
4797
+
4798
+
4799
+
4800
+
4801
+
4802
+
4803
+
4804
+
4805
+
4806
+
4807
+
4808
+
4809
+
4810
+
4811
+
4812
+
4813
+
4814
+
4815
+
4816
+
4817
+
4818
+
4819
+
4820
+
4821
+
4822
+
4823
+
4824
+
4825
+
4826
+
4827
+
4828
+
4829
+
4830
+
4831
+
4832
+
4833
+
4834
+
4835
+
4836
+
4837
+
4838
+
4839
+
4840
+
4841
+
4842
+
4843
+
4844
+
4845
+
4846
+
4847
+
4848
+
4849
+ k
4850
+
4851
+
4852
+
4853
+
4854
+
4855
+
4856
+
4857
+
4858
+
4859
+
4860
+
4861
+
4862
+
4863
+
4864
+
4865
+
4866
+
4867
+
4868
+
4869
+
4870
+
4871
+
4872
+
4873
+
4874
+
4875
+
4876
+
4877
+
4878
+
4879
+
4880
+
4881
+
4882
+
4883
+
4884
+
4885
+ $
4886
+
4887
+
4888
+
4889
+
4890
+
4891
+
4892
+
4893
+
4894
+
4895
+
4896
+
4897
+
4898
+
4899
+
4900
+
4901
+
4902
+ c
4903
+
4904
+
4905
+
4906
+
4907
+
4908
+
4909
+
4910
+
4911
+
4912
+
4913
+
4914
+
4915
+
4916
+
4917
+
4918
+
4919
+
4920
+
4921
+
4922
+ v
4923
+
4924
+
4925
+
4926
+
4927
+
4928
+
4929
+
4930
+
4931
+
4932
+
4933
+
4934
+
4935
+
4936
+
4937
+
4938
+
4939
+
4940
+
4941
+
4942
+
4943
+
4944
+
4945
+
4946
+
4947
+
4948
+
4949
+
4950
+
4951
+
4952
+
4953
+
4954
+
4955
+
4956
+
4957
+
4958
+
4959
+
4960
+
4961
+
4962
+
4963
+
4964
+
4965
+
4966
+
4967
+
4968
+
4969
+
4970
+
4971
+
4972
+
4973
+
4974
+
4975
+
4976
+
4977
+
4978
+
4979
+
4980
+
4981
+
4982
+
4983
+
4984
+
4985
+
4986
+
4987
+
4988
+
4989
+
4990
+
4991
+
4992
+
4993
+
4994
+
4995
+
4996
+
4997
+
4998
+
4999
+
5000
+
5001
+
5002
+
5003
+
5004
+
5005
+
5006
+
5007
+
5008
+
5009
+
5010
+
5011
+
5012
+
5013
+
5014
+
5015
+
5016
+
5017
+
5018
+
5019
+
5020
+
5021
+
5022
+
5023
+
5024
+
5025
+
5026
+
5027
+
5028
+
5029
+
5030
+
5031
+
5032
+
5033
+ W
5034
+
5035
+
5036
+
5037
+
5038
+
5039
+
5040
+
5041
+
5042
+
5043
+
5044
+
5045
+ 穿
5046
+
5047
+
5048
+
5049
+
5050
+
5051
+
5052
+
5053
+
5054
+
5055
+
5056
+
5057
+
5058
+
5059
+
5060
+
5061
+
5062
+
5063
+
5064
+
5065
+
5066
+
5067
+
5068
+
5069
+
5070
+
5071
+
5072
+
5073
+
5074
+
5075
+
5076
+
5077
+
5078
+
5079
+
5080
+
5081
+
5082
+
5083
+
5084
+
5085
+
5086
+ ×
5087
+
5088
+
5089
+
5090
+
5091
+
5092
+
5093
+
5094
+
5095
+
5096
+
5097
+
5098
+
5099
+ 轿
5100
+
5101
+
5102
+
5103
+
5104
+
5105
+
5106
+
5107
+
5108
+
5109
+
5110
+
5111
+
5112
+
5113
+
5114
+
5115
+
5116
+
5117
+
5118
+
5119
+
5120
+
5121
+
5122
+
5123
+
5124
+
5125
+
5126
+
5127
+ R
5128
+ G
5129
+
5130
+
5131
+
5132
+
5133
+
5134
+
5135
+
5136
+
5137
+
5138
+
5139
+
5140
+
5141
+
5142
+
5143
+
5144
+
5145
+
5146
+
5147
+
5148
+
5149
+
5150
+
5151
+
5152
+
5153
+
5154
+
5155
+
5156
+
5157
+
5158
+
5159
+
5160
+
5161
+
5162
+
5163
+
5164
+
5165
+
5166
+
5167
+
5168
+
5169
+ ˉ
5170
+
5171
+ d
5172
+ °
5173
+
5174
+
5175
+
5176
+
5177
+
5178
+
5179
+
5180
+
5181
+
5182
+
5183
+
5184
+
5185
+
5186
+
5187
+
5188
+
5189
+
5190
+
5191
+
5192
+
5193
+ K
5194
+
5195
+
5196
+
5197
+
5198
+
5199
+
5200
+ X
5201
+
5202
+
5203
+
5204
+
5205
+
5206
+
5207
+
5208
+
5209
+
5210
+
5211
+
5212
+
5213
+
5214
+
5215
+
5216
+
5217
+
5218
+
5219
+
5220
+
5221
+
5222
+
5223
+
5224
+
5225
+
5226
+
5227
+
5228
+
5229
+
5230
+
5231
+
5232
+
5233
+ m
5234
+
5235
+
5236
+
5237
+
5238
+
5239
+
5240
+
5241
+
5242
+
5243
+
5244
+ 涿
5245
+
5246
+
5247
+
5248
+
5249
+
5250
+
5251
+
5252
+
5253
+
5254
+
5255
+
5256
+
5257
+
5258
+
5259
+
5260
+
5261
+
5262
+
5263
+
5264
+
5265
+
5266
+
5267
+
5268
+
5269
+
5270
+
5271
+
5272
+
5273
+
5274
+
5275
+
5276
+
5277
+
5278
+
5279
+
5280
+
5281
+
5282
+
5283
+
5284
+
5285
+
5286
+
5287
+
5288
+
5289
+
5290
+
5291
+
5292
+
5293
+
5294
+
5295
+
5296
+
5297
+
5298
+
5299
+
5300
+
5301
+
5302
+
5303
+
5304
+
5305
+
5306
+
5307
+
5308
+
5309
+
5310
+
5311
+
5312
+
5313
+
5314
+
5315
+
5316
+
5317
+
5318
+
5319
+
5320
+
5321
+
5322
+
5323
+
5324
+
5325
+
5326
+
5327
+
5328
+
5329
+
5330
+
5331
+
5332
+
5333
+
5334
+
5335
+ `
5336
+
5337
+
5338
+
5339
+
5340
+
5341
+
5342
+
5343
+
5344
+
5345
+
5346
+
5347
+
5348
+
5349
+
5350
+
5351
+
5352
+
5353
+
5354
+
5355
+
5356
+
5357
+
5358
+
5359
+
5360
+
5361
+
5362
+
5363
+
5364
+
5365
+
5366
+
5367
+
5368
+
5369
+
5370
+
5371
+
5372
+
5373
+
5374
+
5375
+
5376
+
5377
+
5378
+
5379
+
5380
+
5381
+
5382
+
5383
+
5384
+
5385
+
5386
+
5387
+
5388
+
5389
+
5390
+
5391
+
5392
+
5393
+
5394
+
5395
+
5396
+
5397
+
5398
+
5399
+
5400
+
5401
+
5402
+
5403
+
5404
+
5405
+ V
5406
+
5407
+
5408
+
5409
+
5410
+
5411
+
5412
+
5413
+
5414
+
5415
+
5416
+
5417
+
5418
+
5419
+
5420
+
5421
+
5422
+
5423
+
5424
+
5425
+
5426
+
5427
+
5428
+
5429
+
5430
+
5431
+
5432
+
5433
+
5434
+
5435
+
5436
+
5437
+
5438
+
5439
+
5440
+
5441
+
5442
+
5443
+
5444
+
5445
+
5446
+
5447
+
5448
+
5449
+
5450
+
5451
+
5452
+
5453
+
5454
+
5455
+
5456
+
5457
+
5458
+
5459
+
5460
+
5461
+ #
5462
+
5463
+
5464
+
5465
+
5466
+
5467
+
5468
+
5469
+
5470
+
5471
+
5472
+
5473
+
5474
+
5475
+
5476
+
5477
+
5478
+
5479
+
5480
+
5481
+
5482
+
5483
+ 簿
5484
+
5485
+
5486
+
5487
+
5488
+
5489
+ {
5490
+
5491
+
5492
+
5493
+ j
5494
+
5495
+
5496
+
5497
+
5498
+
5499
+
5500
+
5501
+
5502
+
5503
+
5504
+
5505
+
5506
+
5507
+
5508
+
5509
+
5510
+
5511
+
5512
+
5513
+
5514
+
5515
+
5516
+
5517
+
5518
+
5519
+
5520
+
5521
+
5522
+
5523
+
5524
+
5525
+
5526
+
5527
+
5528
+
5529
+ ·
5530
+
5531
+
5532
+
5533
+ Ë
5534
+
5535
+
5536
+
5537
+
5538
+
5539
+
5540
+
5541
+
5542
+
5543
+
5544
+
5545
+
5546
+ ¥
5547
+
5548
+
5549
+
5550
+
5551
+
5552
+
5553
+
5554
+
5555
+
5556
+
5557
+
5558
+
5559
+
5560
+ π
5561
+
5562
+
5563
+
5564
+ é
5565
+
5566
+
5567
+ Λ
5568
+
5569
+
5570
+
5571
+
5572
+
5573
+
5574
+
5575
+
5576
+
5577
+
5578
+
5579
+
5580
+
5581
+
5582
+
5583
+
5584
+
5585
+
5586
+
5587
+
5588
+
5589
+
5590
+
5591
+
5592
+
5593
+
5594
+
5595
+
5596
+
5597
+
5598
+
5599
+
5600
+
5601
+
5602
+
5603
+
5604
+
5605
+ Ο
5606
+
5607
+
5608
+
5609
+
5610
+
5611
+
5612
+
5613
+
5614
+
5615
+
5616
+
5617
+
5618
+
5619
+
5620
+
5621
+
5622
+
5623
+
5624
+
5625
+
5626
+
5627
+
5628
+
5629
+
5630
+
5631
+
5632
+
5633
+
5634
+
5635
+
5636
+
5637
+
5638
+
5639
+
5640
+
5641
+
5642
+
5643
+
5644
+
5645
+
5646
+
5647
+
5648
+
5649
+
5650
+
5651
+
5652
+
5653
+
5654
+
5655
+
5656
+
5657
+
5658
+
5659
+
5660
+
5661
+
5662
+
5663
+
5664
+
5665
+
5666
+
5667
+
5668
+
5669
+
5670
+
5671
+
5672
+
5673
+
5674
+ α
5675
+
5676
+
5677
+
5678
+
5679
+
5680
+
5681
+
5682
+
5683
+
5684
+
5685
+
5686
+
5687
+
5688
+
5689
+
5690
+
5691
+
5692
+
5693
+
5694
+
5695
+
5696
+
5697
+
5698
+
5699
+
5700
+
5701
+
5702
+
5703
+
5704
+
5705
+
5706
+
5707
+
5708
+
5709
+
5710
+  
5711
+
5712
+
5713
+
5714
+
5715
+
5716
+
5717
+
5718
+
5719
+
5720
+
5721
+
5722
+
5723
+
5724
+
5725
+
5726
+
5727
+
5728
+ 鴿
5729
+
5730
+
5731
+
5732
+
5733
+
5734
+
5735
+
5736
+
5737
+
5738
+
5739
+
5740
+
5741
+
5742
+
5743
+
5744
+
5745
+
5746
+
5747
+
5748
+
5749
+
5750
+
5751
+
5752
+
5753
+
5754
+
5755
+
5756
+
5757
+
5758
+
5759
+
5760
+
5761
+
5762
+
5763
+
5764
+
5765
+
5766
+
5767
+
5768
+
5769
+
5770
+
5771
+
5772
+
5773
+
5774
+
5775
+
5776
+
5777
+
5778
+
5779
+
5780
+
5781
+
5782
+
5783
+
5784
+
5785
+
5786
+
5787
+
5788
+
5789
+
5790
+
5791
+
5792
+
5793
+
5794
+
5795
+
5796
+
5797
+
5798
+
5799
+
5800
+ è
5801
+
5802
+
5803
+
5804
+
5805
+
5806
+ Ü
5807
+
5808
+
5809
+
5810
+
5811
+
5812
+
5813
+
5814
+
5815
+
5816
+
5817
+ И
5818
+
5819
+
5820
+
5821
+
5822
+
5823
+
5824
+
5825
+
5826
+
5827
+
5828
+
5829
+
5830
+
5831
+
5832
+
5833
+
5834
+
5835
+
5836
+
5837
+
5838
+ »
5839
+
5840
+
5841
+ ä
5842
+
5843
+
5844
+
5845
+
5846
+
5847
+
5848
+
5849
+
5850
+
5851
+
5852
+
5853
+
5854
+
5855
+
5856
+
5857
+
5858
+
5859
+
5860
+
5861
+
5862
+
5863
+
5864
+
5865
+
5866
+
5867
+
5868
+
5869
+
5870
+
5871
+
5872
+
5873
+
5874
+
5875
+
5876
+ ɔ
5877
+
5878
+
5879
+
5880
+
5881
+
5882
+
5883
+ ´
5884
+
5885
+
5886
+
5887
+
5888
+ í
5889
+
5890
+
5891
+
5892
+
5893
+
5894
+
5895
+
5896
+
5897
+
5898
+
5899
+
5900
+
5901
+
5902
+
5903
+
5904
+
5905
+
5906
+
5907
+
5908
+
5909
+ É
5910
+
5911
+
5912
+
5913
+
5914
+ ʌ
5915
+
5916
+
5917
+
5918
+
5919
+
5920
+
5921
+
5922
+
5923
+
5924
+
5925
+ Я
5926
+ Й
5927
+
5928
+
5929
+
5930
+
5931
+
5932
+
5933
+
5934
+
5935
+
5936
+
5937
+
5938
+
5939
+
5940
+
5941
+
5942
+
5943
+
5944
+
5945
+
5946
+
5947
+
5948
+ 粿
5949
+
5950
+
5951
+
5952
+
5953
+ ®
5954
+
5955
+
5956
+
5957
+
5958
+
5959
+
5960
+
5961
+
5962
+
5963
+
5964
+
5965
+
5966
+ З
5967
+
5968
+
5969
+
5970
+
5971
+
5972
+
5973
+
5974
+
5975
+
5976
+ β
5977
+
5978
+ á
5979
+
5980
+
5981
+
5982
+
5983
+
5984
+
5985
+
5986
+
5987
+
5988
+
5989
+
5990
+
5991
+
5992
+
5993
+
5994
+
5995
+
5996
+
5997
+
5998
+
5999
+
6000
+
6001
+
6002
+
6003
+
6004
+
6005
+
6006
+
6007
+
6008
+
6009
+
6010
+
6011
+
6012
+
6013
+
6014
+
6015
+
6016
+
6017
+
6018
+
6019
+
6020
+
6021
+
6022
+
6023
+
6024
+
6025
+
6026
+
6027
+
6028
+
6029
+
6030
+
6031
+
6032
+
6033
+
6034
+
6035
+
6036
+
6037
+
6038
+
6039
+
6040
+
6041
+
6042
+
6043
+
6044
+
6045
+
6046
+
6047
+
6048
+
6049
+
6050
+
6051
+
6052
+
6053
+
6054
+
6055
+
6056
+
6057
+
6058
+
6059
+
6060
+
6061
+
6062
+
6063
+
6064
+
6065
+
6066
+ Ó
6067
+
6068
+
6069
+
6070
+
6071
+
6072
+
6073
+
6074
+
6075
+
6076
+
6077
+
6078
+
6079
+
6080
+
6081
+
6082
+
6083
+
6084
+
6085
+
6086
+
6087
+
6088
+
6089
+
6090
+
6091
+
6092
+
6093
+
6094
+
6095
+
6096
+ ò
6097
+
6098
+
6099
+
6100
+
6101
+
6102
+
6103
+
6104
+
6105
+
6106
+
6107
+
6108
+
6109
+
6110
+
6111
+
6112
+
6113
+
6114
+
6115
+
6116
+
6117
+
6118
+
6119
+
6120
+
6121
+
6122
+
6123
+
6124
+
6125
+ 貿
6126
+
6127
+
6128
+
6129
+
6130
+
6131
+
6132
+
6133
+
6134
+
6135
+
6136
+
6137
+
6138
+ 𣇉
6139
+
6140
+
6141
+
6142
+
6143
+
6144
+
6145
+
6146
+
6147
+
6148
+
6149
+
6150
+
6151
+
6152
+
6153
+
6154
+
6155
+
6156
+
6157
+
6158
+
6159
+
6160
+
6161
+
6162
+
6163
+
6164
+
6165
+
6166
+
6167
+ г
6168
+
6169
+
6170
+
6171
+
6172
+
6173
+
6174
+
6175
+
6176
+
6177
+
6178
+
6179
+
6180
+
6181
+
6182
+
6183
+
6184
+
6185
+ ���
6186
+
6187
+
6188
+
6189
+
6190
+
6191
+ 楿
6192
+
6193
+
6194
+
6195
+
6196
+
6197
+
6198
+ 滿
6199
+
6200
+
6201
+
6202
+
6203
+
6204
+
6205
+
6206
+
6207
+
6208
+
6209
+
6210
+
6211
+
6212
+
6213
+
6214
+
6215
+
6216
+
6217
+
6218
+
6219
+
6220
+
6221
+
6222
+
6223
+
6224
+
6225
+
6226
+
6227
+
6228
+
6229
+
6230
+
6231
+
6232
+
6233
+
6234
+
6235
+
6236
+
6237
+
6238
+
6239
+
6240
+
6241
+
6242
+
6243
+
6244
+
6245
+
6246
+
6247
+
6248
+
6249
+
6250
+
6251
+
6252
+
6253
+
6254
+ Φ
6255
+
6256
+
6257
+
6258
+
6259
+
6260
+
6261
+ ε
6262
+
6263
+
6264
+
6265
+
6266
+
6267
+
6268
+
6269
+
6270
+
6271
+
6272
+
6273
+
6274
+ ü
6275
+
6276
+
6277
+
6278
+
6279
+ 調
6280
+
6281
+
6282
+
6283
+
6284
+
6285
+
6286
+
6287
+
6288
+
6289
+
6290
+
6291
+
6292
+
6293
+
6294
+
6295
+
6296
+
6297
+
6298
+
6299
+
6300
+
6301
+
6302
+
6303
+
6304
+
6305
+
6306
+
6307
+
6308
+
6309
+
6310
+
6311
+
6312
+
6313
+
6314
+
6315
+
6316
+
6317
+
6318
+
6319
+
6320
+
6321
+
6322
+
6323
+
6324
+
6325
+
6326
+ ˋ
6327
+
6328
+
6329
+ ā
6330
+
6331
+
6332
+
6333
+
6334
+
6335
+
6336
+
6337
+
6338
+
6339
+
6340
+
6341
+
6342
+
6343
+
6344
+
6345
+
6346
+
6347
+
6348
+
6349
+
6350
+
6351
+
6352
+
6353
+
6354
+
6355
+
6356
+
6357
+
6358
+
6359
+
6360
+
6361
+
6362
+
6363
+
6364
+
6365
+
6366
+
6367
+
6368
+
6369
+ ú
6370
+ ó
6371
+
6372
+
6373
+
6374
+
6375
+
6376
+
6377
+
6378
+
6379
+
6380
+
6381
+
6382
+
6383
+
6384
+
6385
+
6386
+
6387
+
6388
+
6389
+
6390
+ ē
6391
+
6392
+
6393
+
6394
+
6395
+
6396
+
6397
+
6398
+
6399
+
6400
+
6401
+
6402
+
6403
+
6404
+
6405
+
6406
+
6407
+
6408
+
6409
+
6410
+
6411
+
6412
+ Ω
6413
+
6414
+
6415
+
6416
+
6417
+
6418
+
6419
+
6420
+
6421
+
6422
+
6423
+
6424
+
6425
+
6426
+
6427
+
6428
+
6429
+
6430
+
6431
+
6432
+
6433
+
6434
+
6435
+
6436
+
6437
+ П
6438
+
6439
+
6440
+
6441
+
6442
+
6443
+
6444
+
6445
+
6446
+
6447
+
6448
+
6449
+
6450
+
6451
+
6452
+
6453
+
6454
+
6455
+
6456
+
6457
+
6458
+
6459
+
6460
+ ǐ
6461
+ ō
6462
+ ǒ
6463
+
6464
+
6465
+
6466
+ μ
6467
+
6468
+
6469
+
6470
+
6471
+
6472
+
6473
+
6474
+
6475
+ à
6476
+ ɡ
6477
+
6478
+
6479
+
6480
+
6481
+
6482
+
6483
+
6484
+
6485
+ ī
6486
+
6487
+
6488
+
6489
+
6490
+
6491
+
6492
+
6493
+
6494
+
6495
+
6496
+
6497
+
6498
+
6499
+
6500
+
6501
+
6502
+
6503
+
6504
+
6505
+
6506
+
6507
+
6508
+
6509
+
6510
+
6511
+
6512
+
6513
+
6514
+
6515
+
6516
+
6517
+
6518
+
6519
+
6520
+
6521
+
6522
+
6523
+
6524
+
6525
+
6526
+
6527
+
6528
+
6529
+
6530
+
6531
+
6532
+
6533
+
6534
+
6535
+
6536
+
6537
+
6538
+
6539
+
6540
+
6541
+ ²
6542
+
6543
+
6544
+
6545
+
6546
+
6547
+
6548
+
6549
+
6550
+
6551
+
6552
+
6553
+
6554
+
6555
+
6556
+
6557
+
6558
+
6559
+
6560
+
6561
+
6562
+
6563
+
6564
+
6565
+
6566
+
6567
+
6568
+
6569
+
6570
+
6571
+
6572
+
6573
+
6574
+
6575
+
6576
+
6577
+
6578
+
6579
+
6580
+
6581
+
6582
+ 駿
6583
+
6584
+
6585
+
6586
+
6587
+
6588
+
6589
+
6590
+
6591
+
6592
+
6593
+
6594
+
6595
+
6596
+
6597
+
6598
+
6599
+
6600
+
6601
+
6602
+
6603
+
6604
+
6605
+
6606
+
6607
+
6608
+
6609
+ θ
6610
+
6611
+
6612
+
6613
+ ū
6614
+ ì
6615
+
6616
+
6617
+
6618
+
6619
+
6620
+
6621
+
6622
+
6623
+
iopaint/model/original_sd_configs/sd_xl_base.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: sgm.models.diffusion.DiffusionEngine
3
+ params:
4
+ scale_factor: 0.13025
5
+ disable_first_stage_autocast: True
6
+
7
+ denoiser_config:
8
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
9
+ params:
10
+ num_idx: 1000
11
+
12
+ scaling_config:
13
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
14
+ discretization_config:
15
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
16
+
17
+ network_config:
18
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
19
+ params:
20
+ adm_in_channels: 2816
21
+ num_classes: sequential
22
+ use_checkpoint: True
23
+ in_channels: 4
24
+ out_channels: 4
25
+ model_channels: 320
26
+ attention_resolutions: [4, 2]
27
+ num_res_blocks: 2
28
+ channel_mult: [1, 2, 4]
29
+ num_head_channels: 64
30
+ use_linear_in_transformer: True
31
+ transformer_depth: [1, 2, 10]
32
+ context_dim: 2048
33
+ spatial_transformer_attn_type: softmax-xformers
34
+
35
+ conditioner_config:
36
+ target: sgm.modules.GeneralConditioner
37
+ params:
38
+ emb_models:
39
+ - is_trainable: False
40
+ input_key: txt
41
+ target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
42
+ params:
43
+ layer: hidden
44
+ layer_idx: 11
45
+
46
+ - is_trainable: False
47
+ input_key: txt
48
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
49
+ params:
50
+ arch: ViT-bigG-14
51
+ version: laion2b_s39b_b160k
52
+ freeze: True
53
+ layer: penultimate
54
+ always_return_pooled: True
55
+ legacy: False
56
+
57
+ - is_trainable: False
58
+ input_key: original_size_as_tuple
59
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
60
+ params:
61
+ outdim: 256
62
+
63
+ - is_trainable: False
64
+ input_key: crop_coords_top_left
65
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
66
+ params:
67
+ outdim: 256
68
+
69
+ - is_trainable: False
70
+ input_key: target_size_as_tuple
71
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
72
+ params:
73
+ outdim: 256
74
+
75
+ first_stage_config:
76
+ target: sgm.models.autoencoder.AutoencoderKL
77
+ params:
78
+ embed_dim: 4
79
+ monitor: val/rec_loss
80
+ ddconfig:
81
+ attn_type: vanilla-xformers
82
+ double_z: true
83
+ z_channels: 4
84
+ resolution: 256
85
+ in_channels: 3
86
+ out_ch: 3
87
+ ch: 128
88
+ ch_mult: [1, 2, 4, 4]
89
+ num_res_blocks: 2
90
+ attn_resolutions: []
91
+ dropout: 0.0
92
+ lossconfig:
93
+ target: torch.nn.Identity
iopaint/model/original_sd_configs/sd_xl_refiner.yaml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: sgm.models.diffusion.DiffusionEngine
3
+ params:
4
+ scale_factor: 0.13025
5
+ disable_first_stage_autocast: True
6
+
7
+ denoiser_config:
8
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
9
+ params:
10
+ num_idx: 1000
11
+
12
+ scaling_config:
13
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
14
+ discretization_config:
15
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
16
+
17
+ network_config:
18
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
19
+ params:
20
+ adm_in_channels: 2560
21
+ num_classes: sequential
22
+ use_checkpoint: True
23
+ in_channels: 4
24
+ out_channels: 4
25
+ model_channels: 384
26
+ attention_resolutions: [4, 2]
27
+ num_res_blocks: 2
28
+ channel_mult: [1, 2, 4, 4]
29
+ num_head_channels: 64
30
+ use_linear_in_transformer: True
31
+ transformer_depth: 4
32
+ context_dim: [1280, 1280, 1280, 1280]
33
+ spatial_transformer_attn_type: softmax-xformers
34
+
35
+ conditioner_config:
36
+ target: sgm.modules.GeneralConditioner
37
+ params:
38
+ emb_models:
39
+ - is_trainable: False
40
+ input_key: txt
41
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
42
+ params:
43
+ arch: ViT-bigG-14
44
+ version: laion2b_s39b_b160k
45
+ legacy: False
46
+ freeze: True
47
+ layer: penultimate
48
+ always_return_pooled: True
49
+
50
+ - is_trainable: False
51
+ input_key: original_size_as_tuple
52
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
53
+ params:
54
+ outdim: 256
55
+
56
+ - is_trainable: False
57
+ input_key: crop_coords_top_left
58
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
59
+ params:
60
+ outdim: 256
61
+
62
+ - is_trainable: False
63
+ input_key: aesthetic_score
64
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
65
+ params:
66
+ outdim: 256
67
+
68
+ first_stage_config:
69
+ target: sgm.models.autoencoder.AutoencoderKL
70
+ params:
71
+ embed_dim: 4
72
+ monitor: val/rec_loss
73
+ ddconfig:
74
+ attn_type: vanilla-xformers
75
+ double_z: true
76
+ z_channels: 4
77
+ resolution: 256
78
+ in_channels: 3
79
+ out_ch: 3
80
+ ch: 128
81
+ ch_mult: [1, 2, 4, 4]
82
+ num_res_blocks: 2
83
+ attn_resolutions: []
84
+ dropout: 0.0
85
+ lossconfig:
86
+ target: torch.nn.Identity
iopaint/model/paint_by_example.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PIL
2
+ import PIL.Image
3
+ import cv2
4
+ import torch
5
+ from loguru import logger
6
+
7
+ from iopaint.helper import decode_base64_to_image
8
+ from .base import DiffusionInpaintModel
9
+ from iopaint.schema import InpaintRequest
10
+ from .utils import get_torch_dtype, enable_low_mem, is_local_files_only
11
+
12
+
13
class PaintByExample(DiffusionInpaintModel):
    """Inpainting model backed by the ``Fantasy-Studio/Paint-by-Example`` pipeline.

    The area to repaint is filled in using a reference ("example") image that is
    carried by the request as a base64 string, see :meth:`forward`.
    """

    # Hugging Face model id handed to ``DiffusionPipeline.from_pretrained``.
    name = "Fantasy-Studio/Paint-by-Example"
    # NOTE(review): pad_mod / min_size are not read in this class; presumably the
    # base class uses them to pad/resize inputs -- confirm in DiffusionInpaintModel.
    pad_mod = 8
    min_size = 512

    def init_model(self, device: torch.device, **kwargs):
        """Load the pipeline and place it on ``device``.

        Keyword options read here (all optional, all default to ``False``):

        * ``no_half``      -- forwarded to ``get_torch_dtype`` when picking the dtype.
        * ``disable_nsfw`` -- load the pipeline without its safety checker.
        * ``cpu_offload``  -- use sequential CPU offload (only when running on GPU);
          also disables the safety checker.
        * ``low_mem``      -- forwarded to ``enable_low_mem``.

        The full ``kwargs`` mapping is additionally passed to
        ``is_local_files_only`` to decide whether downloads are allowed.
        """
        # Imported lazily so that importing this module does not require diffusers.
        from diffusers import DiffusionPipeline

        use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False))
        cpu_offload = kwargs.get("cpu_offload", False)

        model_kwargs = {
            "local_files_only": is_local_files_only(**kwargs),
        }

        # ``.get`` instead of ``kwargs["disable_nsfw"]``: a caller that omits the
        # flag should get the default behaviour rather than a KeyError, matching
        # how every other option in this method is read.
        if kwargs.get("disable_nsfw", False) or cpu_offload:
            logger.info("Disable Paint By Example Model NSFW checker")
            model_kwargs.update(
                dict(safety_checker=None, requires_safety_checker=False)
            )

        self.model = DiffusionPipeline.from_pretrained(
            self.name, torch_dtype=torch_dtype, **model_kwargs
        )
        enable_low_mem(self.model, kwargs.get("low_mem", False))

        # TODO: gpu_id
        if cpu_offload and use_gpu:
            # The image encoder is moved to the target device explicitly before
            # sequential offload is enabled for the rest of the pipeline.
            self.model.image_encoder = self.model.image_encoder.to(device)
            self.model.enable_sequential_cpu_offload(gpu_id=0)
        else:
            self.model = self.model.to(device)

    def forward(self, image, mask, config: InpaintRequest):
        """Input image and output image have same size
        image: [H, W, C] RGB
        mask: [H, W, 1] 255 means area to repaint
        return: BGR IMAGE

        Raises:
            ValueError: if ``config.paint_by_example_example_image`` is ``None``.
        """
        if config.paint_by_example_example_image is None:
            raise ValueError("paint_by_example_example_image is required")

        # Only the decoded image is needed; the two extra return values are ignored.
        example_image, _, _ = decode_base64_to_image(
            config.paint_by_example_example_image
        )

        output = self.model(
            image=PIL.Image.fromarray(image),
            # The last channel of the mask is used as a single-channel "L" image.
            mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"),
            example_image=PIL.Image.fromarray(example_image),
            num_inference_steps=config.sd_steps,
            guidance_scale=config.sd_guidance_scale,
            negative_prompt="out of frame, lowres, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, disfigured, gross proportions, malformed limbs, watermark, signature",
            output_type="np.array",
            # Seeding the global generator makes the run reproducible per request.
            generator=torch.manual_seed(config.sd_seed),
        ).images[0]

        # Scale the pipeline output by 255, round to uint8, then swap RGB -> BGR.
        output = (output * 255).round().astype("uint8")
        output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
        return output
iopaint/model/plms_sampler.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # From: https://github.com/CompVis/latent-diffusion/blob/main/ldm/models/diffusion/plms.py
2
+ import torch
3
+ import numpy as np
4
+ from .utils import make_ddim_timesteps, make_ddim_sampling_parameters, noise_like
5
+ from tqdm import tqdm
6
+
7
+
8
class PLMSSampler(object):
    """Sampler applying a pseudo linear multistep (PLMS) update to a diffusion model.

    The wrapped ``model`` must expose ``num_timesteps``, ``betas``,
    ``alphas_cumprod``, ``alphas_cumprod_prev``, ``device`` and
    ``apply_model``; ``q_sample`` is additionally used when a mask is given.
    """

    def __init__(self, model, schedule="linear", **kwargs):
        """Remember the model, its number of DDPM timesteps and the schedule name.

        Extra keyword arguments are accepted and ignored.
        """
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule

    def register_buffer(self, name, attr):
        """Store ``attr`` on the sampler under the attribute ``name``."""
        setattr(self, name, attr)

    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
        """Precompute the timestep subset and schedule buffers used for sampling.

        Args:
            ddim_num_steps: Number of sampling steps to select.
            ddim_discretize: Discretization method forwarded to ``make_ddim_timesteps``.
            ddim_eta: Must be 0; any other value is rejected.
            verbose: Forwarded to the schedule helper functions.

        Raises:
            ValueError: If ``ddim_eta`` is not 0.
        """
        if ddim_eta != 0:
            raise ValueError('ddim_eta must be 0 for PLMS')
        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
                                                  num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
        alphas_cumprod = self.model.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'

        def to_torch(x):
            # Detached float32 copy placed on the model's device.
            return x.clone().detach().to(torch.float32).to(self.model.device)

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        alphas_cumprod_cpu = alphas_cumprod.cpu()
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod_cpu)))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod_cpu)))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod_cpu)))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod_cpu)))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod_cpu - 1)))

        # ddim sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod_cpu,
                                                                                   ddim_timesteps=self.ddim_timesteps,
                                                                                   eta=ddim_eta, verbose=verbose)
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
                    1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    @torch.no_grad()
    def sample(self,
               steps,
               batch_size,
               shape,
               conditioning=None,
               callback=None,
               normals_sequence=None,
               img_callback=None,
               quantize_x0=False,
               eta=0.,
               mask=None,
               x0=None,
               temperature=1.,
               noise_dropout=0.,
               score_corrector=None,
               corrector_kwargs=None,
               verbose=False,
               x_T=None,
               log_every_t=100,
               unconditional_guidance_scale=1.,
               unconditional_conditioning=None,
               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
               **kwargs
               ):
        """Build the schedule for ``steps`` steps and run PLMS sampling.

        Args:
            steps: Number of sampling steps passed to ``make_schedule``.
            batch_size: Number of samples; prepended to ``shape``.
            shape: ``(C, H, W)`` of one sample.
            conditioning: Optional conditioning (tensor or dict of tensors); a
                warning is printed if its batch dimension differs from
                ``batch_size``.
            eta: Forwarded to ``make_schedule`` as ``ddim_eta`` (must be 0).
            mask, x0: Optional mask and reference sample, forwarded to
                ``plms_sampling``.
            normals_sequence, **kwargs: Accepted but not used here.

        The remaining arguments are forwarded unchanged to ``plms_sampling``.

        Returns:
            The final sample returned by ``plms_sampling``.
        """
        if conditioning is not None:
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        self.make_schedule(ddim_num_steps=steps, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        print(f'Data shape for PLMS sampling is {size}')

        samples = self.plms_sampling(conditioning, size,
                                     callback=callback,
                                     img_callback=img_callback,
                                     quantize_denoised=quantize_x0,
                                     mask=mask, x0=x0,
                                     ddim_use_original_steps=False,
                                     noise_dropout=noise_dropout,
                                     temperature=temperature,
                                     score_corrector=score_corrector,
                                     corrector_kwargs=corrector_kwargs,
                                     x_T=x_T,
                                     log_every_t=log_every_t,
                                     unconditional_guidance_scale=unconditional_guidance_scale,
                                     unconditional_conditioning=unconditional_conditioning,
                                     )
        return samples

    @torch.no_grad()
    def plms_sampling(self, cond, shape,
                      x_T=None, ddim_use_original_steps=False,
                      callback=None, timesteps=None, quantize_denoised=False,
                      mask=None, x0=None, img_callback=None, log_every_t=100,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None, ):
        """Run the sampling loop and return the final sample.

        Args:
            cond: Conditioning forwarded to ``p_sample_plms``.
            shape: Full sample shape; ``shape[0]`` is used as the batch size.
            x_T: Optional starting sample; random normal noise is drawn when omitted.
            ddim_use_original_steps: Iterate over all DDPM timesteps instead of
                the subset built by ``make_schedule``.
            timesteps: Optional limit; when given (and the original steps are
                not used) only a leading slice of ``self.ddim_timesteps`` is run.
            mask, x0: When ``mask`` is given, ``x0`` is required and, before
                each step, the sample is blended with a noised ``x0`` as
                ``img_orig * mask + (1 - mask) * img``.
            callback: Called with the step counter after each step.
            img_callback: Called with ``(pred_x0, step counter)`` after each step.
            log_every_t: Accepted but not used here.

        The remaining arguments are forwarded to ``p_sample_plms``.
        """
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T

        if timesteps is None:
            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
        elif not ddim_use_original_steps:
            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
            timesteps = self.ddim_timesteps[:subset_end]

        time_range = list(reversed(range(0, timesteps))) if ddim_use_original_steps else np.flip(timesteps)
        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
        print(f"Running PLMS Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
        # Noise predictions of the most recent steps (at most three are kept).
        old_eps = []

        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((b,), step, device=device, dtype=torch.long)
            # On the last iteration there is no further step, so the current one is reused.
            ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)

            if mask is not None:
                assert x0 is not None
                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                img = img_orig * mask + (1. - mask) * img

            outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
                                      quantize_denoised=quantize_denoised, temperature=temperature,
                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
                                      corrector_kwargs=corrector_kwargs,
                                      unconditional_guidance_scale=unconditional_guidance_scale,
                                      unconditional_conditioning=unconditional_conditioning,
                                      old_eps=old_eps, t_next=ts_next)
            img, pred_x0, e_t = outs
            old_eps.append(e_t)
            if len(old_eps) >= 4:
                old_eps.pop(0)
            if callback:
                callback(i)
            if img_callback:
                img_callback(pred_x0, i)

        return img

    @torch.no_grad()
    def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
        """Perform one PLMS step.

        Args:
            x: Current sample.
            c: Conditioning passed to ``model.apply_model``.
            t: Timestep tensor for the current step.
            index: Position used to look up the schedule values for this step.
            old_eps: Noise predictions from earlier steps, oldest first; ``None``
                is treated as an empty history.
            t_next: Timestep tensor of the following step; used only when the
                history is empty.
            corrector_kwargs: Extra keyword arguments for
                ``score_corrector.modify_score``; ``None`` means none.

        Returns:
            Tuple ``(x_prev, pred_x0, e_t)``: the updated sample, the predicted
            clean sample, and the raw noise prediction at ``t``.
        """
        b = x.shape[0]
        device = x.device
        # Tolerate the declared defaults instead of failing on len(None) / **None.
        if old_eps is None:
            old_eps = []
        if corrector_kwargs is None:
            corrector_kwargs = {}

        def get_model_output(x, t):
            if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
                e_t = self.model.apply_model(x, t, c)
            else:
                # Evaluate unconditional and conditional branches in one batch,
                # then combine them with the guidance scale.
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t] * 2)
                c_in = torch.cat([unconditional_conditioning, c])
                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
                e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)

            if score_corrector is not None:
                assert self.model.parameterization == "eps"
                e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)

            return e_t

        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        # The full-length sigma buffer is registered on this sampler by
        # make_schedule, not on the model.
        sigmas = self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas

        def get_x_prev_and_pred_x0(e_t, index):
            # select parameters corresponding to the currently considered timestep
            a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
            a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
            sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)

            # current prediction for x_0
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
            if quantize_denoised:
                pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
            # direction pointing to x_t
            dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t
            noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
            if noise_dropout > 0.:
                noise = torch.nn.functional.dropout(noise, p=noise_dropout)
            x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
            return x_prev, pred_x0

        e_t = get_model_output(x, t)
        history = len(old_eps)
        if history == 0:
            # Pseudo Improved Euler (2nd order)
            x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
            e_t_next = get_model_output(x_prev, t_next)
            e_t_prime = (e_t + e_t_next) / 2
        elif history == 1:
            # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (3 * e_t - old_eps[-1]) / 2
        elif history == 2:
            # 3rd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
        else:
            # 4th order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24

        x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)

        return x_prev, pred_x0, e_t
iopaint/model/power_paint/pipeline_powerpaint.py ADDED
@@ -0,0 +1,1243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import PIL
20
+ import torch
21
+ from packaging import version
22
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
23
+ from diffusers.configuration_utils import FrozenDict
24
+ from diffusers.image_processor import VaeImageProcessor
25
+ from diffusers.loaders import (
26
+ FromSingleFileMixin,
27
+ LoraLoaderMixin,
28
+ TextualInversionLoaderMixin,
29
+ )
30
+ from diffusers.models import (
31
+ AsymmetricAutoencoderKL,
32
+ AutoencoderKL,
33
+ UNet2DConditionModel,
34
+ )
35
+ from diffusers.schedulers import KarrasDiffusionSchedulers
36
+ from diffusers.utils import (
37
+ deprecate,
38
+ is_accelerate_available,
39
+ is_accelerate_version,
40
+ logging,
41
+ )
42
+ from diffusers.utils.torch_utils import randn_tensor
43
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
44
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
45
+ from diffusers.pipelines.stable_diffusion.safety_checker import (
46
+ StableDiffusionSafetyChecker,
47
+ )
48
+
49
+
50
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
51
+
52
+
53
def prepare_mask_and_masked_image(
    image, mask, height, width, return_image: bool = False
):
    """
    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
    ``image`` and ``1`` for the ``mask``.

    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
    binarized (``mask >= 0.5`` becomes ``1``, everything else ``0``). Tensor inputs are never modified in place:
    the binarized mask is a copy.

    Args:
        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
        mask (Union[np.array, PIL.Image, torch.Tensor]): The mask to apply to the image, i.e. regions to inpaint.
            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
        height (int): Target height; only used to resize ``PIL.Image`` inputs.
        width (int): Target width; only used to resize ``PIL.Image`` inputs.
        return_image (bool): When ``True`` the preprocessed image is returned as a third element.


    Raises:
        ValueError: ``image`` or ``mask`` is ``None``. ValueError: ``torch.Tensor`` images should be in the
            ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask should be in the ``[0, 1]`` range.
        AssertionError: ``mask`` and ``image`` tensors should have the same spatial dimensions and batch size.
        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
            (or the other way around).

    Returns:
        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
            dimensions: ``batch x channels x height x width``. When ``return_image`` is set, the triple
            (mask, masked_image, image).
    """

    if image is None:
        raise ValueError("`image` input cannot be undefined.")

    if mask is None:
        raise ValueError("`mask_image` input cannot be undefined.")

    if isinstance(image, torch.Tensor):
        if not isinstance(mask, torch.Tensor):
            raise TypeError(
                f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not"
            )

        # Batch single image
        if image.ndim == 3:
            assert (
                image.shape[0] == 3
            ), "Image outside a batch should be of shape (3, H, W)"
            image = image.unsqueeze(0)

        # Batch and add channel dim for single mask
        if mask.ndim == 2:
            mask = mask.unsqueeze(0).unsqueeze(0)

        # Batch single mask or add channel dim
        if mask.ndim == 3:
            # Single batched mask, no channel dim or single mask not batched but channel dim
            if mask.shape[0] == 1:
                mask = mask.unsqueeze(0)

            # Batched masks no channel dim
            else:
                mask = mask.unsqueeze(1)

        assert (
            image.ndim == 4 and mask.ndim == 4
        ), "Image and Mask must have 4 dimensions"
        assert (
            image.shape[-2:] == mask.shape[-2:]
        ), "Image and Mask must have the same spatial dimensions"
        assert (
            image.shape[0] == mask.shape[0]
        ), "Image and Mask must have the same batch size"

        # Check image is in [-1, 1]
        if image.min() < -1 or image.max() > 1:
            raise ValueError("Image should be in [-1, 1] range")

        # Check mask is in [0, 1]
        if mask.min() < 0 or mask.max() > 1:
            raise ValueError("Mask should be in [0, 1] range")

        # Binarize mask on a copy: `unsqueeze` returns a view, so writing into
        # `mask` directly would overwrite the caller's tensor.
        mask = mask.clone()
        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1

        # Image as float32
        image = image.to(dtype=torch.float32)
    elif isinstance(mask, torch.Tensor):
        raise TypeError(
            f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not"
        )
    else:
        # preprocess image
        if isinstance(image, (PIL.Image.Image, np.ndarray)):
            image = [image]
        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
            # resize all images w.r.t. passed height and width
            image = [
                i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image
            ]
            image = [np.array(i.convert("RGB"))[None, :] for i in image]
            image = np.concatenate(image, axis=0)
        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
            image = np.concatenate([i[None, :] for i in image], axis=0)

        # NHWC -> NCHW, then map [0, 255] to [-1, 1]
        image = image.transpose(0, 3, 1, 2)
        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0

        # preprocess mask
        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
            mask = [mask]

        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
            mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
            mask = np.concatenate(
                [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0
            )
            mask = mask.astype(np.float32) / 255.0
        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
            # np.concatenate allocates a new array, so the in-place
            # binarization below does not touch the caller's data.
            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)

        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1
        mask = torch.from_numpy(mask)

    # Keep only the pixels outside the region to inpaint.
    masked_image = image * (mask < 0.5)

    # n.b. ensure backwards compatibility as old function does not return image
    if return_image:
        return mask, masked_image, image

    return mask, masked_image
186
+
187
+
188
+ class StableDiffusionInpaintPipeline(
189
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
190
+ ):
191
+ r"""
192
+ Pipeline for text-guided image inpainting using Stable Diffusion.
193
+
194
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
195
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
196
+
197
+ The pipeline also inherits the following loading methods:
198
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
199
+ - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
200
+ - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
201
+
202
+ Args:
203
+ vae ([`AutoencoderKL`, `AsymmetricAutoencoderKL`]):
204
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
205
+ text_encoder ([`CLIPTextModel`]):
206
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
207
+ tokenizer ([`~transformers.CLIPTokenizer`]):
208
+ A `CLIPTokenizer` to tokenize text.
209
+ unet ([`UNet2DConditionModel`]):
210
+ A `UNet2DConditionModel` to denoise the encoded image latents.
211
+ scheduler ([`SchedulerMixin`]):
212
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
213
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
214
+ safety_checker ([`StableDiffusionSafetyChecker`]):
215
+ Classification module that estimates whether generated images could be considered offensive or harmful.
216
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
217
+ about a model's potential harms.
218
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
219
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
220
+ """
221
+ _optional_components = ["safety_checker", "feature_extractor"]
222
+
223
def __init__(
    self,
    vae: Union[AutoencoderKL, AsymmetricAutoencoderKL],
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPImageProcessor,
    requires_safety_checker: bool = True,
):
    """Validate the components, patch outdated configs and register the modules.

    Outdated scheduler settings (``steps_offset != 1``, ``skip_prk_steps``
    false) and a too-small UNet ``sample_size`` on old checkpoints trigger a
    deprecation notice and are overridden on the component's internal config.

    Raises:
        ValueError: If a ``safety_checker`` is given without a
            ``feature_extractor``.
    """
    super().__init__()

    if (
        hasattr(scheduler.config, "steps_offset")
        and scheduler.config.steps_offset != 1
    ):
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
            f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
            "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
            " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
            " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
            " file"
        )
        deprecate(
            "steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False
        )
        new_config = dict(scheduler.config)
        new_config["steps_offset"] = 1
        scheduler._internal_dict = FrozenDict(new_config)

    if (
        hasattr(scheduler.config, "skip_prk_steps")
        and scheduler.config.skip_prk_steps is False
    ):
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} has not set the configuration"
            " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
            " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
            " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
            " Hub, it would be very nice if you could open a Pull request for the"
            " `scheduler/scheduler_config.json` file"
        )
        deprecate(
            "skip_prk_steps not set",
            "1.0.0",
            deprecation_message,
            standard_warn=False,
        )
        new_config = dict(scheduler.config)
        new_config["skip_prk_steps"] = True
        scheduler._internal_dict = FrozenDict(new_config)

    if safety_checker is None and requires_safety_checker:
        logger.warning(
            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
        )

    if safety_checker is not None and feature_extractor is None:
        # f-string so that the class name is actually interpolated.
        raise ValueError(
            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
        )

    # Only checkpoints saved by diffusers versions before 0.9.0 are patched.
    is_unet_version_less_0_9_0 = hasattr(
        unet.config, "_diffusers_version"
    ) and version.parse(
        version.parse(unet.config._diffusers_version).base_version
    ) < version.parse(
        "0.9.0.dev0"
    )
    is_unet_sample_size_less_64 = (
        hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
    )
    if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
        deprecation_message = (
            "The configuration file of the unet has set the default `sample_size` to smaller than"
            " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
            " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
            " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
            " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
            " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
            " in the config might lead to incorrect results in future versions. If you have downloaded this"
            " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
            " the `unet/config.json` file"
        )
        deprecate(
            "sample_size<64", "1.0.0", deprecation_message, standard_warn=False
        )
        new_config = dict(unet.config)
        new_config["sample_size"] = 64
        unet._internal_dict = FrozenDict(new_config)

    # Check shapes, assume num_channels_latents == 4, num_channels_mask == 1, num_channels_masked == 4
    if unet.config.in_channels != 9:
        logger.info(
            f"You have loaded a UNet with {unet.config.in_channels} input channels which."
        )

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
    )
    # One factor of two per VAE block beyond the first.
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    self.register_to_config(requires_safety_checker=requires_safety_checker)
340
+
341
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
342
def enable_model_cpu_offload(self, gpu_id=0):
    r"""
    Offload all models to CPU to reduce memory usage with a low impact on performance. Each model is moved to the
    GPU as a whole when its `forward` method is called and stays there until the next model runs. Memory savings
    are lower than with `enable_sequential_cpu_offload`, but performance is much better due to the iterative
    execution of the `unet`.

    Raises:
        ImportError: If `accelerate` is missing or older than 0.17.0.
    """
    accelerate_ok = is_accelerate_available() and is_accelerate_version(
        ">=", "0.17.0.dev0"
    )
    if not accelerate_ok:
        raise ImportError(
            "`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher."
        )
    from accelerate import cpu_offload_with_hook

    target_device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        self.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

    # Hooks are chained in execution order; the safety checker, when present, runs last.
    offload_chain = [self.text_encoder, self.unet, self.vae]
    if self.safety_checker is not None:
        offload_chain.append(self.safety_checker)

    last_hook = None
    for module in offload_chain:
        _, last_hook = cpu_offload_with_hook(
            module, target_device, prev_module_hook=last_hook
        )

    # We'll offload the last model manually.
    self.final_offload_hook = last_hook
375
+
376
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
377
+ def _encode_prompt(
378
+ self,
379
+ promptA,
380
+ promptB,
381
+ t,
382
+ device,
383
+ num_images_per_prompt,
384
+ do_classifier_free_guidance,
385
+ negative_promptA=None,
386
+ negative_promptB=None,
387
+ t_nag=None,
388
+ prompt_embeds: Optional[torch.FloatTensor] = None,
389
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
390
+ lora_scale: Optional[float] = None,
391
+ ):
392
+ r"""
393
+ Encodes the prompt into text encoder hidden states.
394
+
395
+ Args:
396
+ prompt (`str` or `List[str]`, *optional*):
397
+ prompt to be encoded
398
+ device: (`torch.device`):
399
+ torch device
400
+ num_images_per_prompt (`int`):
401
+ number of images that should be generated per prompt
402
+ do_classifier_free_guidance (`bool`):
403
+ whether to use classifier free guidance or not
404
+ negative_prompt (`str` or `List[str]`, *optional*):
405
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
406
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
407
+ less than `1`).
408
+ prompt_embeds (`torch.FloatTensor`, *optional*):
409
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
410
+ provided, text embeddings will be generated from `prompt` input argument.
411
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
412
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
413
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
414
+ argument.
415
+ lora_scale (`float`, *optional*):
416
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
417
+ """
418
+ # set lora scale so that monkey patched LoRA
419
+ # function of text encoder can correctly access it
420
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
421
+ self._lora_scale = lora_scale
422
+
423
+ prompt = promptA
424
+ negative_prompt = negative_promptA
425
+
426
+ if promptA is not None and isinstance(promptA, str):
427
+ batch_size = 1
428
+ elif promptA is not None and isinstance(promptA, list):
429
+ batch_size = len(promptA)
430
+ else:
431
+ batch_size = prompt_embeds.shape[0]
432
+
433
+ if prompt_embeds is None:
434
+ # textual inversion: procecss multi-vector tokens if necessary
435
+ if isinstance(self, TextualInversionLoaderMixin):
436
+ promptA = self.maybe_convert_prompt(promptA, self.tokenizer)
437
+
438
+ text_inputsA = self.tokenizer(
439
+ promptA,
440
+ padding="max_length",
441
+ max_length=self.tokenizer.model_max_length,
442
+ truncation=True,
443
+ return_tensors="pt",
444
+ )
445
+ text_inputsB = self.tokenizer(
446
+ promptB,
447
+ padding="max_length",
448
+ max_length=self.tokenizer.model_max_length,
449
+ truncation=True,
450
+ return_tensors="pt",
451
+ )
452
+ text_input_idsA = text_inputsA.input_ids
453
+ text_input_idsB = text_inputsB.input_ids
454
+ untruncated_ids = self.tokenizer(
455
+ promptA, padding="longest", return_tensors="pt"
456
+ ).input_ids
457
+
458
+ if untruncated_ids.shape[-1] >= text_input_idsA.shape[
459
+ -1
460
+ ] and not torch.equal(text_input_idsA, untruncated_ids):
461
+ removed_text = self.tokenizer.batch_decode(
462
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
463
+ )
464
+ logger.warning(
465
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
466
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
467
+ )
468
+
469
+ if (
470
+ hasattr(self.text_encoder.config, "use_attention_mask")
471
+ and self.text_encoder.config.use_attention_mask
472
+ ):
473
+ attention_mask = text_inputsA.attention_mask.to(device)
474
+ else:
475
+ attention_mask = None
476
+
477
+ # print("text_input_idsA: ",text_input_idsA)
478
+ # print("text_input_idsB: ",text_input_idsB)
479
+ # print('t: ',t)
480
+
481
+ prompt_embedsA = self.text_encoder(
482
+ text_input_idsA.to(device),
483
+ attention_mask=attention_mask,
484
+ )
485
+ prompt_embedsA = prompt_embedsA[0]
486
+
487
+ prompt_embedsB = self.text_encoder(
488
+ text_input_idsB.to(device),
489
+ attention_mask=attention_mask,
490
+ )
491
+ prompt_embedsB = prompt_embedsB[0]
492
+ prompt_embeds = prompt_embedsA * (t) + (1 - t) * prompt_embedsB
493
+ # print("prompt_embeds: ",prompt_embeds)
494
+
495
+ if self.text_encoder is not None:
496
+ prompt_embeds_dtype = self.text_encoder.dtype
497
+ elif self.unet is not None:
498
+ prompt_embeds_dtype = self.unet.dtype
499
+ else:
500
+ prompt_embeds_dtype = prompt_embeds.dtype
501
+
502
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
503
+
504
+ bs_embed, seq_len, _ = prompt_embeds.shape
505
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
506
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
507
+ prompt_embeds = prompt_embeds.view(
508
+ bs_embed * num_images_per_prompt, seq_len, -1
509
+ )
510
+
511
+ # get unconditional embeddings for classifier free guidance
512
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
513
+ uncond_tokensA: List[str]
514
+ uncond_tokensB: List[str]
515
+ if negative_prompt is None:
516
+ uncond_tokensA = [""] * batch_size
517
+ uncond_tokensB = [""] * batch_size
518
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
519
+ raise TypeError(
520
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
521
+ f" {type(prompt)}."
522
+ )
523
+ elif isinstance(negative_prompt, str):
524
+ uncond_tokensA = [negative_promptA]
525
+ uncond_tokensB = [negative_promptB]
526
+ elif batch_size != len(negative_prompt):
527
+ raise ValueError(
528
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
529
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
530
+ " the batch size of `prompt`."
531
+ )
532
+ else:
533
+ uncond_tokensA = negative_promptA
534
+ uncond_tokensB = negative_promptB
535
+
536
+ # textual inversion: procecss multi-vector tokens if necessary
537
+ if isinstance(self, TextualInversionLoaderMixin):
538
+ uncond_tokensA = self.maybe_convert_prompt(
539
+ uncond_tokensA, self.tokenizer
540
+ )
541
+ uncond_tokensB = self.maybe_convert_prompt(
542
+ uncond_tokensB, self.tokenizer
543
+ )
544
+
545
+ max_length = prompt_embeds.shape[1]
546
+ uncond_inputA = self.tokenizer(
547
+ uncond_tokensA,
548
+ padding="max_length",
549
+ max_length=max_length,
550
+ truncation=True,
551
+ return_tensors="pt",
552
+ )
553
+ uncond_inputB = self.tokenizer(
554
+ uncond_tokensB,
555
+ padding="max_length",
556
+ max_length=max_length,
557
+ truncation=True,
558
+ return_tensors="pt",
559
+ )
560
+
561
+ if (
562
+ hasattr(self.text_encoder.config, "use_attention_mask")
563
+ and self.text_encoder.config.use_attention_mask
564
+ ):
565
+ attention_mask = uncond_inputA.attention_mask.to(device)
566
+ else:
567
+ attention_mask = None
568
+
569
+ negative_prompt_embedsA = self.text_encoder(
570
+ uncond_inputA.input_ids.to(device),
571
+ attention_mask=attention_mask,
572
+ )
573
+ negative_prompt_embedsB = self.text_encoder(
574
+ uncond_inputB.input_ids.to(device),
575
+ attention_mask=attention_mask,
576
+ )
577
+ negative_prompt_embeds = (
578
+ negative_prompt_embedsA[0] * (t_nag)
579
+ + (1 - t_nag) * negative_prompt_embedsB[0]
580
+ )
581
+
582
+ # negative_prompt_embeds = negative_prompt_embeds[0]
583
+
584
+ if do_classifier_free_guidance:
585
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
586
+ seq_len = negative_prompt_embeds.shape[1]
587
+
588
+ negative_prompt_embeds = negative_prompt_embeds.to(
589
+ dtype=prompt_embeds_dtype, device=device
590
+ )
591
+
592
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
593
+ 1, num_images_per_prompt, 1
594
+ )
595
+ negative_prompt_embeds = negative_prompt_embeds.view(
596
+ batch_size * num_images_per_prompt, seq_len, -1
597
+ )
598
+
599
+ # For classifier free guidance, we need to do two forward passes.
600
+ # Here we concatenate the unconditional and text embeddings into a single batch
601
+ # to avoid doing two forward passes
602
+ # print("prompt_embeds: ",prompt_embeds)
603
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
604
+
605
+ return prompt_embeds
606
+
607
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
608
def run_safety_checker(self, image, device, dtype):
    """Run the optional safety checker over the generated images.

    Args:
        image: Generated images; may be a torch tensor or any other format
            accepted by ``self.image_processor.numpy_to_pil``.
        device: Device the feature-extractor output is moved to.
        dtype: Dtype the extracted pixel values are cast to before being
            handed to the safety checker.

    Returns:
        A ``(image, has_nsfw_concept)`` tuple. When ``self.safety_checker``
        is ``None`` the image is returned untouched and the flag is ``None``;
        otherwise both values come straight from the safety checker.
    """
    if self.safety_checker is None:
        # Nothing to screen with: pass the images through unchanged.
        return image, None

    # Convert to PIL first, picking the converter that matches the input form.
    if torch.is_tensor(image):
        pil_images = self.image_processor.postprocess(image, output_type="pil")
    else:
        pil_images = self.image_processor.numpy_to_pil(image)

    checker_inputs = self.feature_extractor(pil_images, return_tensors="pt").to(
        device
    )
    clip_pixels = checker_inputs.pixel_values.to(dtype)
    image, has_nsfw_concept = self.safety_checker(
        images=image, clip_input=clip_pixels
    )
    return image, has_nsfw_concept
625
+
626
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
627
def prepare_extra_step_kwargs(self, generator, eta):
    """Build the optional keyword arguments accepted by ``self.scheduler.step``.

    Scheduler ``step`` signatures differ, so only the arguments the current
    scheduler actually declares are forwarded.

    Args:
        generator: Random generator(s) to pass on if ``step`` takes a
            ``generator`` parameter.
        eta: The eta (η) value of the DDIM paper
            (https://arxiv.org/abs/2010.02502), expected to lie in [0, 1].
            Passed on only if ``step`` takes an ``eta`` parameter.

    Returns:
        A dict holding ``"eta"`` and/or ``"generator"`` (in that order) for
        each name present in the signature of ``self.scheduler.step``.
    """
    step_params = inspect.signature(self.scheduler.step).parameters
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_params}
647
+
648
def check_inputs(
    self,
    prompt,
    height,
    width,
    strength,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
):
    """Validate the arguments of a pipeline call, raising on the first problem.

    Args:
        prompt: Text prompt (``str`` or ``list``) or ``None``.
        height: Requested image height; must be divisible by 8.
        width: Requested image width; must be divisible by 8.
        strength: Must lie in ``[0, 1]``.
        callback_steps: Must be a positive ``int``.
        negative_prompt: Optional negative prompt; mutually exclusive with
            ``negative_prompt_embeds``.
        prompt_embeds: Optional precomputed embeddings; exactly one of
            ``prompt`` and ``prompt_embeds`` must be given.
        negative_prompt_embeds: Optional precomputed negative embeddings;
            when passed together with ``prompt_embeds`` their ``shape``
            attributes must be equal.

    Raises:
        ValueError: If any of the constraints above is violated.
    """
    if strength < 0 or strength > 1:
        raise ValueError(
            f"The value of strength should in [0.0, 1.0] but is {strength}"
        )

    if any(side % 8 != 0 for side in (height, width)):
        raise ValueError(
            f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
        )

    # ``None`` is not an int, so it is rejected by the same test.
    steps_ok = isinstance(callback_steps, int) and callback_steps > 0
    if not steps_ok:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    has_prompt = prompt is not None
    has_embeds = prompt_embeds is not None
    if has_prompt and has_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(
            f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
        )

    has_negative_embeds = negative_prompt_embeds is not None
    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if (
        has_embeds
        and has_negative_embeds
        and prompt_embeds.shape != negative_prompt_embeds.shape
    ):
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )
707
+
708
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
    image=None,
    timestep=None,
    is_strength_max=True,
    return_noise=False,
    return_image_latents=False,
):
    """Create the starting latents for the denoising loop.

    Args:
        batch_size: Effective batch size of the latents.
        num_channels_latents: Channel count of the latents.
        height: Image height; divided by ``self.vae_scale_factor``.
        width: Image width; divided by ``self.vae_scale_factor``.
        dtype: Dtype for freshly sampled noise and the encoded image.
        device: Device for the noise, latents and encoded image.
        generator: A generator or a list of generators; a list must have
            ``batch_size`` entries.
        latents: Optional caller-supplied latents. When given they are
            used as the noise and scaled by the scheduler's
            ``init_noise_sigma``.
        image: Image to encode; required when ``is_strength_max`` is false.
        timestep: Noise timestep; required when ``is_strength_max`` is false.
        is_strength_max: When true the latents start as pure noise;
            otherwise noise is added to the encoded image via
            ``self.scheduler.add_noise``.
        return_noise: Append the noise tensor to the result.
        return_image_latents: Append the encoded image latents to the result.

    Returns:
        A tuple ``(latents[, noise][, image_latents])``.

    Raises:
        ValueError: If the generator list length does not match
            ``batch_size``, or if ``image``/``timestep`` is missing while
            ``is_strength_max`` is false.
    """
    latent_height = height // self.vae_scale_factor
    latent_width = width // self.vae_scale_factor
    latent_shape = (batch_size, num_channels_latents, latent_height, latent_width)

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if (image is None or timestep is None) and not is_strength_max:
        raise ValueError(
            "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
            "However, either the image or the noise timestep has not been provided."
        )

    # The image is encoded either because the caller asked for its latents
    # or because it is about to be blended with fresh noise.
    if return_image_latents or (latents is None and not is_strength_max):
        image = image.to(device=device, dtype=dtype)
        image_latents = self._encode_vae_image(image=image, generator=generator)

    if latents is not None:
        # Caller-provided latents double as the noise sample.
        noise = latents.to(device)
        latents = noise * self.scheduler.init_noise_sigma
    else:
        noise = randn_tensor(
            latent_shape, generator=generator, device=device, dtype=dtype
        )
        if is_strength_max:
            # Pure noise start: scale by the scheduler's initial sigma.
            latents = noise * self.scheduler.init_noise_sigma
        else:
            # Partial strength: start from the noised image latents.
            latents = self.scheduler.add_noise(image_latents, noise, timestep)

    results = [latents]
    if return_noise:
        results.append(noise)
    if return_image_latents:
        results.append(image_latents)
    return tuple(results)
773
+
774
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
775
+ if isinstance(generator, list):
776
+ image_latents = [
777
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(
778
+ generator=generator[i]
779
+ )
780
+ for i in range(image.shape[0])
781
+ ]
782
+ image_latents = torch.cat(image_latents, dim=0)
783
+ else:
784
+ image_latents = self.vae.encode(image).latent_dist.sample(
785
+ generator=generator
786
+ )
787
+
788
+ image_latents = self.vae.config.scaling_factor * image_latents
789
+
790
+ return image_latents
791
+
792
def prepare_mask_latents(
    self,
    mask,
    masked_image,
    batch_size,
    height,
    width,
    dtype,
    device,
    generator,
    do_classifier_free_guidance,
):
    """Resize the mask to latent resolution and encode the masked image.

    Args:
        mask: Mask tensor, interpolated to
            ``(height // vae_scale_factor, width // vae_scale_factor)``.
        masked_image: Masked image, encoded via ``self._encode_vae_image``.
        batch_size: Target batch size; smaller mask / latent batches are
            repeated up to it and must divide it evenly.
        height: Image height in pixels.
        width: Image width in pixels.
        dtype: Dtype the tensors are cast to.
        device: Device the tensors are moved to.
        generator: Generator(s) forwarded to the VAE encoding step.
        do_classifier_free_guidance: When true, both outputs are duplicated
            along the batch dimension.

    Returns:
        ``(mask, masked_image_latents)``.

    Raises:
        ValueError: If ``batch_size`` is not a multiple of the mask batch
            or of the masked-image-latent batch.
    """
    # Resize before the dtype cast so cpu_offload with half precision does
    # not break (the mask is later concatenated to the latents).
    latent_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
    mask = torch.nn.functional.interpolate(mask, size=latent_size)
    mask = mask.to(device=device, dtype=dtype)

    masked_image = masked_image.to(device=device, dtype=dtype)
    masked_image_latents = self._encode_vae_image(masked_image, generator=generator)

    # Repeat mask and latents for every generation per prompt (mps friendly).
    mask_count = mask.shape[0]
    if mask_count < batch_size:
        if batch_size % mask_count != 0:
            raise ValueError(
                "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                " of masks that you pass is divisible by the total requested batch size."
            )
        mask = mask.repeat(batch_size // mask_count, 1, 1, 1)

    latent_count = masked_image_latents.shape[0]
    if latent_count < batch_size:
        if batch_size % latent_count != 0:
            raise ValueError(
                "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
                " Make sure the number of images that you pass is divisible by the total requested batch size."
            )
        masked_image_latents = masked_image_latents.repeat(
            batch_size // latent_count, 1, 1, 1
        )

    if do_classifier_free_guidance:
        mask = torch.cat([mask, mask])
        masked_image_latents = torch.cat([masked_image_latents, masked_image_latents])

    # Align device/dtype so the later concat with the latent model input works.
    masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
    return mask, masked_image_latents
845
+
846
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
847
def get_timesteps(self, num_inference_steps, strength, device):
    """Select the tail of the scheduler's timesteps that matches ``strength``.

    Args:
        num_inference_steps: Number of steps the scheduler was set up with.
        strength: Fraction of the schedule to run; the number of steps kept
            is ``min(int(num_inference_steps * strength), num_inference_steps)``.
        device: Not used by this method.

    Returns:
        ``(timesteps, steps)`` where ``timesteps`` is
        ``self.scheduler.timesteps`` sliced from the first kept step
        (scaled by ``self.scheduler.order``) and ``steps`` is the number of
        inference steps that remain.
    """
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    skipped_steps = max(num_inference_steps - steps_to_run, 0)

    first_index = skipped_steps * self.scheduler.order
    remaining_timesteps = self.scheduler.timesteps[first_index:]
    return remaining_timesteps, num_inference_steps - skipped_steps
855
+
856
@torch.no_grad()
def __call__(
    self,
    promptA: Union[str, List[str]] = None,
    promptB: Union[str, List[str]] = None,
    image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    strength: float = 1.0,
    tradoff: float = 1.0,
    tradoff_nag: float = 1.0,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    negative_promptA: Optional[Union[str, List[str]]] = None,
    negative_promptB: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    task_class: Union[torch.Tensor, float, int] = None,
):
    r"""
    The call function to the pipeline for inpainting with two blended prompts.

    Args:
        promptA (`str` or `List[str]`, *optional*):
            First prompt. It is the prompt validated by `check_inputs` and the one that determines the batch
            size. If not defined, you need to pass `prompt_embeds`.
        promptB (`str` or `List[str]`, *optional*):
            Second prompt, forwarded together with `promptA` to `_encode_prompt`.
        image (`PIL.Image.Image`):
            `Image` or tensor representing an image batch to be inpainted (which parts of the image to be masked
            out with `mask_image` and repainted according to the prompts).
        mask_image (`PIL.Image.Image`):
            `Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted
            while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel
            (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the
            expected shape would be `(B, H, W, 1)`.
        height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The width in pixels of the generated image.
        strength (`float`, *optional*, defaults to 1.0):
            Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
            starting point and more noise is added the higher the `strength`. The number of denoising steps depends
            on the amount of noise initially added. When `strength` is 1, the latents are initialised with pure
            noise and the denoising process runs for the full number of iterations specified in
            `num_inference_steps`.
        tradoff (`float`, *optional*, defaults to 1.0):
            Blend weight forwarded to `_encode_prompt` for the positive prompts (presumably the weight of the
            `promptA` embeddings, with `1 - tradoff` applied to `promptB` -- confirm against `_encode_prompt`).
        tradoff_nag (`float`, *optional*, defaults to 1.0):
            Blend weight forwarded to `_encode_prompt` for the negative prompts (presumably the weight of the
            `negative_promptA` embeddings -- confirm against `_encode_prompt`).
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference. This parameter is modulated by `strength`.
        guidance_scale (`float`, *optional*, defaults to 7.5):
            A higher guidance scale value encourages the model to generate images closely linked to the text
            prompts at the expense of lower image quality. Guidance is enabled when `guidance_scale > 1`.
        negative_promptA (`str` or `List[str]`, *optional*):
            First negative prompt; it is the one validated by `check_inputs`. Mutually exclusive with
            `negative_prompt_embeds`.
        negative_promptB (`str` or `List[str]`, *optional*):
            Second negative prompt, forwarded together with `negative_promptA` to `_encode_prompt`.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only passed
            to schedulers whose `step` accepts an `eta` argument.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
            generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents to be used as inputs for image generation. If not provided, a latents
            tensor is generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. If not provided, text embeddings are generated from the prompts.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. If not provided, they are generated from the negative
            prompts.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format handed to `self.image_processor.postprocess`. With `"latent"` the VAE decode and
            the safety checker are skipped and the latents themselves are post-processed.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
            plain tuple.
        callback (`Callable`, *optional*):
            A function invoked during inference whenever the progress bar advances and
            `i % callback_steps == 0`. NOTE: it is called as `callback(self, i, t, {})` (pipeline, step index,
            timestep, empty dict), which does not match the three-argument type annotation.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function is called. Must be a positive integer.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary passed along to the UNet. Its `"scale"` entry, if present, is also used as the
            LoRA scale for the text encoder.
        task_class (`torch.Tensor`, `float` or `int`, *optional*):
            When not `None`, forwarded to the UNet as the `task_class` keyword argument; otherwise the UNet is
            called without it.

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
            otherwise a `tuple` is returned where the first element is the generated images and the second
            element is the safety checker's "not-safe-for-work" (nsfw) result (`None` when no safety checker
            ran).

    Raises:
        ValueError: If `check_inputs` rejects the arguments, if fewer than one inference step remains after
            applying `strength`, or if the UNet's input channel count is neither 4 nor 9 (or does not match the
            latent + mask + masked-image channels in the 9-channel case).
    """
    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor
    # Only the "A" prompts take part in validation and batch-size detection.
    prompt = promptA
    negative_prompt = negative_promptA
    # 1. Check inputs
    self.check_inputs(
        prompt,
        height,
        width,
        strength,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None)
        if cross_attention_kwargs is not None
        else None
    )
    prompt_embeds = self._encode_prompt(
        promptA,
        promptB,
        tradoff,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_promptA,
        negative_promptB,
        tradoff_nag,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 4. set timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps=num_inference_steps, strength=strength, device=device
    )
    # check that number of inference steps is not < 1 - as this doesn't make sense
    if num_inference_steps < 1:
        raise ValueError(
            f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
            f" steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
        )
    # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
    # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
    is_strength_max = strength == 1.0

    # 5. Preprocess mask and image
    mask, masked_image, init_image = prepare_mask_and_masked_image(
        image, mask_image, height, width, return_image=True
    )
    # Keep an untouched copy: `mask` is resized and duplicated below, while the
    # asymmetric VAE decode further down receives this copy.
    mask_condition = mask.clone()

    # 6. Prepare latent variables
    num_channels_latents = self.vae.config.latent_channels
    num_channels_unet = self.unet.config.in_channels
    # A 4-channel UNet gets no mask input, so the image latents are needed to
    # re-impose the unmasked region after every step.
    return_image_latents = num_channels_unet == 4

    latents_outputs = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
        image=init_image,
        timestep=latent_timestep,
        is_strength_max=is_strength_max,
        return_noise=True,
        return_image_latents=return_image_latents,
    )

    if return_image_latents:
        latents, noise, image_latents = latents_outputs
    else:
        latents, noise = latents_outputs

    # 7. Prepare mask latent variables
    mask, masked_image_latents = self.prepare_mask_latents(
        mask,
        masked_image,
        batch_size * num_images_per_prompt,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        do_classifier_free_guidance,
    )

    # 8. Check that sizes of mask, masked image and latents match
    if num_channels_unet == 9:
        # default case for runwayml/stable-diffusion-inpainting
        num_channels_mask = mask.shape[1]
        num_channels_masked_image = masked_image_latents.shape[1]
        if (
            num_channels_latents + num_channels_mask + num_channels_masked_image
            != self.unet.config.in_channels
        ):
            raise ValueError(
                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
                " `pipeline.unet` or your `mask_image` or `image` input."
            )
    elif num_channels_unet != 4:
        raise ValueError(
            f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
        )

    # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 10. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = (
                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            )

            latent_model_input = self.scheduler.scale_model_input(
                latent_model_input, t
            )

            # concat latents, mask, masked_image_latents in the channel dimension
            if num_channels_unet == 9:
                latent_model_input = torch.cat(
                    [latent_model_input, mask, masked_image_latents], dim=1
                )

            # predict the noise residual
            if task_class is not None:
                noise_pred = self.unet(
                    sample=latent_model_input,
                    timestep=t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                    task_class=task_class,
                )[0]
            else:
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(
                noise_pred, t, latents, **extra_step_kwargs, return_dict=False
            )[0]

            if num_channels_unet == 4:
                # Re-impose the original image outside the mask, noised to
                # the level of the next timestep (left clean on the last step).
                init_latents_proper = image_latents[:1]
                init_mask = mask[:1]

                if i < len(timesteps) - 1:
                    noise_timestep = timesteps[i + 1]
                    init_latents_proper = self.scheduler.add_noise(
                        init_latents_proper, noise, torch.tensor([noise_timestep])
                    )

                latents = (
                    1 - init_mask
                ) * init_latents_proper + init_mask * latents

            # call the callback, if provided
            if i == len(timesteps) - 1 or (
                (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
            ):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(self, i, t, {})

    if output_type != "latent":
        condition_kwargs = {}
        if isinstance(self.vae, AsymmetricAutoencoderKL):
            init_image = init_image.to(
                device=device, dtype=masked_image_latents.dtype
            )
            init_image_condition = init_image.clone()
            init_image = self._encode_vae_image(init_image, generator=generator)
            mask_condition = mask_condition.to(
                device=device, dtype=masked_image_latents.dtype
            )
            condition_kwargs = {
                "image": init_image_condition,
                "mask": mask_condition,
            }
        image = self.vae.decode(
            latents / self.vae.config.scaling_factor,
            return_dict=False,
            **condition_kwargs,
        )[0]
        image, has_nsfw_concept = self.run_safety_checker(
            image, device, prompt_embeds.dtype
        )
    else:
        image = latents
        has_nsfw_concept = None

    # Images flagged by the safety checker are not denormalized.
    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    image = self.image_processor.postprocess(
        image, output_type=output_type, do_denormalize=do_denormalize
    )

    # Offload last model to CPU
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.final_offload_hook.offload()

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(
        images=image, nsfw_content_detected=has_nsfw_concept
    )
iopaint/model/power_paint/pipeline_powerpaint_controlnet.py ADDED
@@ -0,0 +1,1775 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # This model implementation is heavily inspired by https://github.com/haofanwang/ControlNet-for-Diffusers/
16
+
17
+ import inspect
18
+ import warnings
19
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import PIL.Image
23
+ import torch
24
+ import torch.nn.functional as F
25
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
26
+
27
+ from diffusers.image_processor import VaeImageProcessor
28
+ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
29
+ from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
30
+ from diffusers.schedulers import KarrasDiffusionSchedulers
31
+ from diffusers.utils import (
32
+ is_accelerate_available,
33
+ is_accelerate_version,
34
+ logging,
35
+ replace_example_docstring,
36
+ )
37
+ from diffusers.utils.torch_utils import randn_tensor,is_compiled_module
38
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
39
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
40
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
41
+ from diffusers.pipelines.controlnet import MultiControlNetModel
42
+
43
+
44
+
45
+
46
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
47
+
48
+
49
# Usage example for the pipeline; presumably spliced into the `__call__` docstring via
# `replace_example_docstring` (the call site is outside this chunk -- confirm).
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> # !pip install transformers accelerate
        >>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, DDIMScheduler
        >>> from diffusers.utils import load_image
        >>> import numpy as np
        >>> import torch

        >>> init_image = load_image(
        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png"
        ... )
        >>> init_image = init_image.resize((512, 512))

        >>> generator = torch.Generator(device="cpu").manual_seed(1)

        >>> mask_image = load_image(
        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png"
        ... )
        >>> mask_image = mask_image.resize((512, 512))


        >>> def make_inpaint_condition(image, image_mask):
        ...     image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
        ...     image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0

        ...     assert image.shape[0:2] == image_mask.shape[0:2], "image and image_mask must have the same image size"
        ...     image[image_mask > 0.5] = -1.0  # set as masked pixel
        ...     image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
        ...     image = torch.from_numpy(image)
        ...     return image


        >>> control_image = make_inpaint_condition(init_image, mask_image)

        >>> controlnet = ControlNetModel.from_pretrained(
        ...     "lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16
        ... )
        >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
        ... )

        >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        >>> pipe.enable_model_cpu_offload()

        >>> # generate image
        >>> image = pipe(
        ...     "a handsome man with ray-ban sunglasses",
        ...     num_inference_steps=20,
        ...     generator=generator,
        ...     eta=1.0,
        ...     image=init_image,
        ...     mask_image=mask_image,
        ...     control_image=control_image,
        ... ).images[0]
        ```
"""
106
+
107
+
108
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.prepare_mask_and_masked_image
109
def prepare_mask_and_masked_image(image, mask, height, width, return_image=False):
    """
    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. Both inputs are converted to
    ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the ``image``
    and ``1`` for the ``mask``.

    The ``image`` is converted to ``torch.float32``; non-tensor images are additionally rescaled from ``[0, 255]``
    to ``[-1, 1]``. The ``mask`` is binarized (values ``>= 0.5`` become ``1``, everything else ``0``). A tensor
    ``mask`` passed by the caller is never modified; binarization happens on a copy.

    Args:
        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. Lists of ``PIL.Image`` or
            ``np.array`` are accepted as a batch.
        mask (Union[np.array, PIL.Image, torch.Tensor]): The mask to apply to the image, i.e. regions to inpaint.
            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
        height (int): Target height. Only used to resize ``PIL.Image`` inputs.
        width (int): Target width. Only used to resize ``PIL.Image`` inputs.
        return_image (bool, optional): When ``True`` the preprocessed image is returned as a third element.

    Raises:
        ValueError: ``image`` or ``mask`` is ``None``.
        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range.
        ValueError: ``torch.Tensor`` mask should be in the ``[0, 1]`` range.
        AssertionError: tensor ``mask`` and ``image`` disagree in rank, spatial dimensions or batch size.
        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not (or the other way around).

    Returns:
        tuple[torch.Tensor]: The pair ``(mask, masked_image)``, or the triple ``(mask, masked_image, image)`` when
        ``return_image`` is set, each with 4 dimensions: ``batch x channels x height x width``.
    """

    if image is None:
        raise ValueError("`image` input cannot be undefined.")

    if mask is None:
        raise ValueError("`mask_image` input cannot be undefined.")

    if isinstance(image, torch.Tensor):
        if not isinstance(mask, torch.Tensor):
            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)}) is not")

        # Batch single image
        if image.ndim == 3:
            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
            image = image.unsqueeze(0)

        # Batch and add channel dim for single mask
        if mask.ndim == 2:
            mask = mask.unsqueeze(0).unsqueeze(0)

        # Batch single mask or add channel dim
        if mask.ndim == 3:
            # Single batched mask, no channel dim or single mask not batched but channel dim
            if mask.shape[0] == 1:
                mask = mask.unsqueeze(0)

            # Batched masks no channel dim
            else:
                mask = mask.unsqueeze(1)

        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"

        # Check image is in [-1, 1]
        if image.min() < -1 or image.max() > 1:
            raise ValueError("Image should be in [-1, 1] range")

        # Check mask is in [0, 1]
        if mask.min() < 0 or mask.max() > 1:
            raise ValueError("Mask should be in [0, 1] range")

        # Binarize mask. `unsqueeze` only returns a view, so work on a copy to
        # avoid silently overwriting the tensor the caller handed in.
        mask = mask.clone()
        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1

        # Image as float32
        image = image.to(dtype=torch.float32)
    elif isinstance(mask, torch.Tensor):
        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)}) is not")
    else:
        # preprocess image
        if isinstance(image, (PIL.Image.Image, np.ndarray)):
            image = [image]
        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
            # resize all images w.r.t. passed height and width
            image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
            image = [np.array(i.convert("RGB"))[None, :] for i in image]
            image = np.concatenate(image, axis=0)
        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
            image = np.concatenate([i[None, :] for i in image], axis=0)

        # NHWC -> NCHW, then map [0, 255] to [-1, 1]
        image = image.transpose(0, 3, 1, 2)
        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0

        # preprocess mask
        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
            mask = [mask]

        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
            mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
            mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
            mask = mask.astype(np.float32) / 255.0
        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
            # `np.concatenate` allocates a new array, so the in-place binarization
            # below cannot touch the caller's data.
            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)

        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1
        mask = torch.from_numpy(mask)

    # Zero out the region that is going to be inpainted.
    masked_image = image * (mask < 0.5)

    # n.b. ensure backwards compatibility as old function does not return image
    if return_image:
        return mask, masked_image, image

    return mask, masked_image
224
+
225
+
226
+ class StableDiffusionControlNetInpaintPipeline(
227
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
228
+ ):
229
+ r"""
230
+ Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
231
+
232
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
233
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
234
+
235
+ In addition the pipeline inherits the following loading methods:
236
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
237
+
238
+ <Tip>
239
+
240
+ This pipeline can be used both with checkpoints that have been specifically fine-tuned for inpainting, such as
241
+ [runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting)
242
+ as well as default text-to-image stable diffusion checkpoints, such as
243
+ [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5).
244
+ Default text-to-image stable diffusion checkpoints might be preferable for controlnets that have been fine-tuned on
245
+ those, such as [lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint).
246
+
247
+ </Tip>
248
+
249
+ Args:
250
+ vae ([`AutoencoderKL`]):
251
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
252
+ text_encoder ([`CLIPTextModel`]):
253
+ Frozen text-encoder. Stable Diffusion uses the text portion of
254
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
255
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
256
+ tokenizer (`CLIPTokenizer`):
257
+ Tokenizer of class
258
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
259
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
260
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
261
+ Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
262
+ as a list, the outputs from each ControlNet are added together to create one combined additional
263
+ conditioning.
264
+ scheduler ([`SchedulerMixin`]):
265
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
266
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
267
+ safety_checker ([`StableDiffusionSafetyChecker`]):
268
+ Classification module that estimates whether generated images could be considered offensive or harmful.
269
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
270
+ feature_extractor ([`CLIPImageProcessor`]):
271
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
272
+ """
273
+ _optional_components = ["safety_checker", "feature_extractor"]
274
+
275
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
    scheduler: KarrasDiffusionSchedulers,
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPImageProcessor,
    requires_safety_checker: bool = True,
):
    """
    Register the pipeline components and build the image processors.

    A list or tuple of ControlNets is wrapped into a single `MultiControlNetModel` before registration.

    Args:
        vae: Autoencoder; its `config.block_out_channels` determines `vae_scale_factor`.
        text_encoder: Text encoder registered as `self.text_encoder`.
        tokenizer: Tokenizer registered as `self.tokenizer`.
        unet: Denoising network registered as `self.unet`.
        controlnet: One ControlNet, a list/tuple of them, or a `MultiControlNetModel`.
        scheduler: Scheduler registered as `self.scheduler`.
        safety_checker: Optional safety checker; may be `None`.
        feature_extractor: Required whenever `safety_checker` is not `None`.
        requires_safety_checker: Stored in the config; when `True` and `safety_checker` is `None` a warning
            is logged.

    Raises:
        ValueError: If a `safety_checker` is given without a `feature_extractor`.
    """
    super().__init__()

    if safety_checker is None and requires_safety_checker:
        logger.warning(
            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
        )

    if safety_checker is not None and feature_extractor is None:
        # The `f` prefix is required so that `{self.__class__}` is interpolated
        # instead of being shown verbatim to the user.
        raise ValueError(
            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
        )

    if isinstance(controlnet, (list, tuple)):
        controlnet = MultiControlNetModel(controlnet)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        controlnet=controlnet,
        scheduler=scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
    )
    # One factor of two per VAE block after the first.
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    # Control images are converted to RGB but deliberately left un-normalized.
    self.control_image_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
    )
    self.register_to_config(requires_safety_checker=requires_safety_checker)
324
+
325
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
326
def enable_vae_slicing(self):
    r"""
    Switch the VAE to sliced decoding by calling `self.vae.enable_slicing()`.

    With slicing on, the VAE splits the input tensor into slices and decodes them in several steps, which saves
    memory and allows larger batch sizes.
    """
    vae = self.vae
    vae.enable_slicing()
332
+
333
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
334
def disable_vae_slicing(self):
    r"""
    Switch sliced VAE decoding off again by calling `self.vae.disable_slicing()`.

    After a previous `enable_vae_slicing`, decoding goes back to being computed in one step.
    """
    vae = self.vae
    vae.disable_slicing()
340
+
341
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
342
def enable_vae_tiling(self):
    r"""
    Switch the VAE to tiled processing by calling `self.vae.enable_tiling()`.

    With tiling on, the VAE splits the input tensor into tiles and computes decoding and encoding in several
    steps, which saves a large amount of memory and allows processing larger images.
    """
    vae = self.vae
    vae.enable_tiling()
349
+
350
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
351
def disable_vae_tiling(self):
    r"""
    Switch tiled VAE processing off again by calling `self.vae.disable_tiling()`.

    After a previous `enable_vae_tiling`, decoding goes back to being computed in one step.
    """
    vae = self.vae
    vae.disable_tiling()
357
+
358
def enable_model_cpu_offload(self, gpu_id=0):
    r"""
    Offload the pipeline's models to CPU through accelerate's `cpu_offload_with_hook`.

    The text encoder, UNet, VAE and (if present) safety checker are chained so that each hook knows its
    predecessor; the ControlNet gets a standalone hook. The hook of the last chained model is kept in
    `self.final_offload_hook` so it can be offloaded manually later.

    Args:
        gpu_id (`int`, defaults to 0): Index of the CUDA device the models are moved to for execution.

    Raises:
        ImportError: If accelerate is missing or older than `0.17.0.dev0`.
    """
    accelerate_ok = is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0")
    if not accelerate_ok:
        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

    from accelerate import cpu_offload_with_hook

    target_device = torch.device(f"cuda:{gpu_id}")

    chained_models = [self.text_encoder, self.unet, self.vae]
    if self.safety_checker is not None:
        # the safety checker can offload the vae again
        chained_models.append(self.safety_checker)

    last_hook = None
    for model in chained_models:
        _, last_hook = cpu_offload_with_hook(model, target_device, prev_module_hook=last_hook)

    # the controlnet hook has to be offloaded manually as it alternates with the unet
    cpu_offload_with_hook(self.controlnet, target_device)

    # We'll offload the last model manually.
    self.final_offload_hook = last_hook
385
+
386
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
387
+ def _encode_prompt(
388
+ self,
389
+ promptA,
390
+ promptB,
391
+ t,
392
+ device,
393
+ num_images_per_prompt,
394
+ do_classifier_free_guidance,
395
+ negative_promptA=None,
396
+ negative_promptB=None,
397
+ t_nag = None,
398
+ prompt_embeds: Optional[torch.FloatTensor] = None,
399
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
400
+ lora_scale: Optional[float] = None,
401
+ ):
402
+ r"""
403
+ Encodes the prompt into text encoder hidden states.
404
+
405
+ Args:
406
+ prompt (`str` or `List[str]`, *optional*):
407
+ prompt to be encoded
408
+ device: (`torch.device`):
409
+ torch device
410
+ num_images_per_prompt (`int`):
411
+ number of images that should be generated per prompt
412
+ do_classifier_free_guidance (`bool`):
413
+ whether to use classifier free guidance or not
414
+ negative_prompt (`str` or `List[str]`, *optional*):
415
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
416
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
417
+ less than `1`).
418
+ prompt_embeds (`torch.FloatTensor`, *optional*):
419
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
420
+ provided, text embeddings will be generated from `prompt` input argument.
421
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
422
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
423
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
424
+ argument.
425
+ lora_scale (`float`, *optional*):
426
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
427
+ """
428
+ # set lora scale so that monkey patched LoRA
429
+ # function of text encoder can correctly access it
430
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
431
+ self._lora_scale = lora_scale
432
+
433
+ prompt = promptA
434
+ negative_prompt = negative_promptA
435
+
436
+ if promptA is not None and isinstance(promptA, str):
437
+ batch_size = 1
438
+ elif promptA is not None and isinstance(promptA, list):
439
+ batch_size = len(promptA)
440
+ else:
441
+ batch_size = prompt_embeds.shape[0]
442
+
443
+ if prompt_embeds is None:
444
+ # textual inversion: procecss multi-vector tokens if necessary
445
+ if isinstance(self, TextualInversionLoaderMixin):
446
+ promptA = self.maybe_convert_prompt(promptA, self.tokenizer)
447
+
448
+ text_inputsA = self.tokenizer(
449
+ promptA,
450
+ padding="max_length",
451
+ max_length=self.tokenizer.model_max_length,
452
+ truncation=True,
453
+ return_tensors="pt",
454
+ )
455
+ text_inputsB = self.tokenizer(
456
+ promptB,
457
+ padding="max_length",
458
+ max_length=self.tokenizer.model_max_length,
459
+ truncation=True,
460
+ return_tensors="pt",
461
+ )
462
+ text_input_idsA = text_inputsA.input_ids
463
+ text_input_idsB = text_inputsB.input_ids
464
+ untruncated_ids = self.tokenizer(promptA, padding="longest", return_tensors="pt").input_ids
465
+
466
+ if untruncated_ids.shape[-1] >= text_input_idsA.shape[-1] and not torch.equal(
467
+ text_input_idsA, untruncated_ids
468
+ ):
469
+ removed_text = self.tokenizer.batch_decode(
470
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
471
+ )
472
+ logger.warning(
473
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
474
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
475
+ )
476
+
477
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
478
+ attention_mask = text_inputsA.attention_mask.to(device)
479
+ else:
480
+ attention_mask = None
481
+
482
+ # print("text_input_idsA: ",text_input_idsA)
483
+ # print("text_input_idsB: ",text_input_idsB)
484
+ # print('t: ',t)
485
+
486
+ prompt_embedsA = self.text_encoder(
487
+ text_input_idsA.to(device),
488
+ attention_mask=attention_mask,
489
+ )
490
+ prompt_embedsA = prompt_embedsA[0]
491
+
492
+ prompt_embedsB = self.text_encoder(
493
+ text_input_idsB.to(device),
494
+ attention_mask=attention_mask,
495
+ )
496
+ prompt_embedsB = prompt_embedsB[0]
497
+ prompt_embeds = prompt_embedsA*(t)+(1-t)*prompt_embedsB
498
+ # print("prompt_embeds: ",prompt_embeds)
499
+
500
+ if self.text_encoder is not None:
501
+ prompt_embeds_dtype = self.text_encoder.dtype
502
+ elif self.unet is not None:
503
+ prompt_embeds_dtype = self.unet.dtype
504
+ else:
505
+ prompt_embeds_dtype = prompt_embeds.dtype
506
+
507
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
508
+
509
+ bs_embed, seq_len, _ = prompt_embeds.shape
510
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
511
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
512
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
513
+
514
+ # get unconditional embeddings for classifier free guidance
515
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
516
+ uncond_tokensA: List[str]
517
+ uncond_tokensB: List[str]
518
+ if negative_prompt is None:
519
+ uncond_tokensA = [""] * batch_size
520
+ uncond_tokensB = [""] * batch_size
521
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
522
+ raise TypeError(
523
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
524
+ f" {type(prompt)}."
525
+ )
526
+ elif isinstance(negative_prompt, str):
527
+ uncond_tokensA = [negative_promptA]
528
+ uncond_tokensB = [negative_promptB]
529
+ elif batch_size != len(negative_prompt):
530
+ raise ValueError(
531
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
532
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
533
+ " the batch size of `prompt`."
534
+ )
535
+ else:
536
+ uncond_tokensA = negative_promptA
537
+ uncond_tokensB = negative_promptB
538
+
539
+ # textual inversion: procecss multi-vector tokens if necessary
540
+ if isinstance(self, TextualInversionLoaderMixin):
541
+ uncond_tokensA = self.maybe_convert_prompt(uncond_tokensA, self.tokenizer)
542
+ uncond_tokensB = self.maybe_convert_prompt(uncond_tokensB, self.tokenizer)
543
+
544
+ max_length = prompt_embeds.shape[1]
545
+ uncond_inputA = self.tokenizer(
546
+ uncond_tokensA,
547
+ padding="max_length",
548
+ max_length=max_length,
549
+ truncation=True,
550
+ return_tensors="pt",
551
+ )
552
+ uncond_inputB = self.tokenizer(
553
+ uncond_tokensB,
554
+ padding="max_length",
555
+ max_length=max_length,
556
+ truncation=True,
557
+ return_tensors="pt",
558
+ )
559
+
560
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
561
+ attention_mask = uncond_inputA.attention_mask.to(device)
562
+ else:
563
+ attention_mask = None
564
+
565
+ negative_prompt_embedsA = self.text_encoder(
566
+ uncond_inputA.input_ids.to(device),
567
+ attention_mask=attention_mask,
568
+ )
569
+ negative_prompt_embedsB = self.text_encoder(
570
+ uncond_inputB.input_ids.to(device),
571
+ attention_mask=attention_mask,
572
+ )
573
+ negative_prompt_embeds = negative_prompt_embedsA[0]*(t_nag)+(1-t_nag)*negative_prompt_embedsB[0]
574
+
575
+ # negative_prompt_embeds = negative_prompt_embeds[0]
576
+
577
+ if do_classifier_free_guidance:
578
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
579
+ seq_len = negative_prompt_embeds.shape[1]
580
+
581
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
582
+
583
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
584
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
585
+
586
+ # For classifier free guidance, we need to do two forward passes.
587
+ # Here we concatenate the unconditional and text embeddings into a single batch
588
+ # to avoid doing two forward passes
589
+ # print("prompt_embeds: ",prompt_embeds)
590
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
591
+
592
+ return prompt_embeds
593
+
594
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
595
def run_safety_checker(self, image, device, dtype):
    """
    Run the optional safety checker over `image`.

    Args:
        image: Tensor or array of images. Tensors are converted with `image_processor.postprocess`, anything
            else with `image_processor.numpy_to_pil`, before being handed to the feature extractor.
        device: Device the extracted features are moved to.
        dtype: Dtype the extracted pixel values are cast to.

    Returns:
        Tuple `(image, has_nsfw_concept)`. Without a safety checker the image is returned untouched and
        `has_nsfw_concept` is `None`.
    """
    checker = self.safety_checker
    if checker is None:
        return image, None

    if torch.is_tensor(image):
        pil_images = self.image_processor.postprocess(image, output_type="pil")
    else:
        pil_images = self.image_processor.numpy_to_pil(image)

    clip_features = self.feature_extractor(pil_images, return_tensors="pt").to(device)
    image, has_nsfw_concept = checker(images=image, clip_input=clip_features.pixel_values.to(dtype))
    return image, has_nsfw_concept
608
+
609
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
610
def decode_latents(self, latents):
    """
    Decode `latents` with the VAE into a channels-last float32 numpy array clamped to `[0, 1]`.

    Deprecated: emits a `FutureWarning` on every call.
    """
    warnings.warn(
        "The decode_latents method is deprecated and will be removed in a future version. Please"
        " use VaeImageProcessor instead",
        FutureWarning,
    )
    inverse_scale = 1 / self.vae.config.scaling_factor
    decoded = self.vae.decode(inverse_scale * latents, return_dict=False)[0]
    # shift from [-1, 1] to [0, 1]
    pixels = (decoded / 2 + 0.5).clamp(0, 1)
    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
    return pixels.cpu().permute(0, 2, 3, 1).float().numpy()
622
+
623
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
624
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Build the optional keyword arguments for `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are only forwarded when the
    signature names them. eta (η) corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502) and
    should be between [0, 1].

    Returns:
        dict containing `"eta"` and/or `"generator"`, restricted to what `step` accepts.
    """
    step_params = inspect.signature(self.scheduler.step).parameters
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_params}
640
+
641
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
642
def get_timesteps(self, num_inference_steps, strength, device):
    """Select the tail of the scheduler's timesteps that ``strength`` asks for.

    Args:
        num_inference_steps: Number of steps the scheduler was configured with.
        strength: Fraction of the schedule to run; ``int(steps * strength)``
            steps are kept, capped at ``num_inference_steps``.
        device: Unused by this method; kept for interface compatibility.

    Returns:
        ``(timesteps, steps)`` where ``timesteps`` is
        ``self.scheduler.timesteps`` sliced from the computed start index
        (scaled by ``self.scheduler.order``) and ``steps`` is the number of
        inference steps that remain.
    """
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    first_step = max(num_inference_steps - steps_to_run, 0)

    offset = first_step * self.scheduler.order
    remaining = self.scheduler.timesteps[offset:]
    return remaining, num_inference_steps - first_step
650
+
651
def check_inputs(
    self,
    prompt,
    image,
    height,
    width,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    controlnet_conditioning_scale=1.0,
    control_guidance_start=0.0,
    control_guidance_end=1.0,
):
    """Validate the arguments of a ControlNet-guided call.

    Nothing is returned; the first problem found is raised.

    Args:
        prompt: ``str``, ``list`` or ``None``. Exactly one of ``prompt`` and
            ``prompt_embeds`` must be given.
        image: Control image(s). With a single ControlNet it is passed to
            ``check_image`` as is; with a ``MultiControlNetModel`` it must be
            a flat list with one entry per ControlNet.
        height, width: Must both be divisible by 8.
        callback_steps: Must be a positive ``int``.
        negative_prompt: Mutually exclusive with ``negative_prompt_embeds``.
        prompt_embeds: Pre-computed embeddings used instead of ``prompt``.
        negative_prompt_embeds: Must have the same shape as ``prompt_embeds``
            when both are given.
        controlnet_conditioning_scale: ``float`` for a single ControlNet; for
            multiple ControlNets, a list of it must be flat and have one
            entry per ControlNet.
        control_guidance_start, control_guidance_end: Scalars or lists.
            Scalars are broadcast to a list (one entry per ControlNet, or to
            the length of the other argument when that one is a list) before
            validation, so the scalar defaults are accepted.

    Raises:
        ValueError: A value is out of range or inconsistent with another.
        TypeError: An argument has an unsupported type.
        AssertionError: ``self.controlnet`` is neither a ``ControlNetModel``
            nor a ``MultiControlNetModel`` (optionally wrapped by
            ``torch.compile``).
    """
    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    # `None` fails the isinstance test as well, so it needs no separate case.
    if not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )

    # `prompt` needs more sophisticated handling when there are multiple
    # conditionings.
    if isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(prompt, list):
            logger.warning(
                f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
                " prompts. The conditionings will be fixed across the prompts."
            )

    # Classify the controlnet once; a torch.compile'd module is inspected
    # through its `_orig_mod`.
    is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
        self.controlnet, torch._dynamo.eval_frame.OptimizedModule
    )
    is_single = isinstance(self.controlnet, ControlNetModel) or (
        is_compiled and isinstance(self.controlnet._orig_mod, ControlNetModel)
    )
    is_multi = not is_single and (
        isinstance(self.controlnet, MultiControlNetModel)
        or (is_compiled and isinstance(self.controlnet._orig_mod, MultiControlNetModel))
    )
    if not (is_single or is_multi):
        # Raised explicitly (instead of `assert False`) so the check survives
        # `python -O`.
        raise AssertionError(f"Unsupported controlnet type: {type(self.controlnet)}")

    # Check `image`
    if is_single:
        self.check_image(image, prompt, prompt_embeds)
    else:
        if not isinstance(image, list):
            raise TypeError("For multiple controlnets: `image` must be type `list`")

        # When `image` is a nested list:
        # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
        elif any(isinstance(i, list) for i in image):
            raise ValueError("A single batch of multiple conditionings are supported at the moment.")
        elif len(image) != len(self.controlnet.nets):
            raise ValueError(
                f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
            )

        for image_ in image:
            self.check_image(image_, prompt, prompt_embeds)

    # Check `controlnet_conditioning_scale`
    if is_single:
        if not isinstance(controlnet_conditioning_scale, float):
            raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
    elif isinstance(controlnet_conditioning_scale, list):
        if any(isinstance(i, list) for i in controlnet_conditioning_scale):
            raise ValueError("A single batch of multiple conditionings are supported at the moment.")
        # This length check used to sit in an `elif` that repeated the
        # `isinstance(..., list)` test above it and therefore never ran.
        if len(controlnet_conditioning_scale) != len(self.controlnet.nets):
            raise ValueError(
                "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
                " the same length as the number of controlnets"
            )

    # Broadcast scalar guidance bounds to lists so that the `len()` / `zip()`
    # checks below also work with this method's own scalar defaults.
    start_is_seq = isinstance(control_guidance_start, (list, tuple))
    end_is_seq = isinstance(control_guidance_end, (list, tuple))
    if not start_is_seq and not end_is_seq:
        mult = len(self.controlnet.nets) if is_multi else 1
        control_guidance_start = mult * [control_guidance_start]
        control_guidance_end = mult * [control_guidance_end]
    elif not start_is_seq:
        control_guidance_start = len(control_guidance_end) * [control_guidance_start]
    elif not end_is_seq:
        control_guidance_end = len(control_guidance_start) * [control_guidance_end]

    if len(control_guidance_start) != len(control_guidance_end):
        raise ValueError(
            f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
        )

    if isinstance(self.controlnet, MultiControlNetModel):
        if len(control_guidance_start) != len(self.controlnet.nets):
            raise ValueError(
                f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
            )

    for start, end in zip(control_guidance_start, control_guidance_end):
        if start >= end:
            raise ValueError(
                f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
            )
        if start < 0.0:
            raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
        if end > 1.0:
            raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
790
+
791
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
792
def check_image(self, image, prompt, prompt_embeds):
    """Validate one control image (or image batch) against the prompt batch.

    Args:
        image: A PIL image, ``torch.Tensor``, ``np.ndarray`` or a non-empty
            list whose first element is one of those.
        prompt: ``str``, ``list`` or ``None``; used to derive the prompt
            batch size.
        prompt_embeds: Used for the prompt batch size (``shape[0]``) when
            ``prompt`` is neither a ``str`` nor a ``list``.

    Raises:
        TypeError: ``image`` is of an unsupported type. An empty list is
            reported here as well, instead of failing with ``IndexError``
            while its first element is inspected.
        ValueError: The image batch size is neither 1 nor equal to the
            prompt batch size.
    """
    element_types = (PIL.Image.Image, torch.Tensor, np.ndarray)

    image_is_pil = isinstance(image, PIL.Image.Image)
    image_is_array = isinstance(image, (torch.Tensor, np.ndarray))
    # Only the first element is inspected; `len(image) > 0` keeps an empty
    # list from raising IndexError.
    image_is_list = isinstance(image, list) and len(image) > 0 and isinstance(image[0], element_types)

    if not (image_is_pil or image_is_array or image_is_list):
        raise TypeError(
            f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
        )

    image_batch_size = 1 if image_is_pil else len(image)

    if isinstance(prompt, str):
        prompt_batch_size = 1
    elif isinstance(prompt, list):
        prompt_batch_size = len(prompt)
    elif prompt_embeds is not None:
        prompt_batch_size = prompt_embeds.shape[0]
    else:
        # No usable prompt information: skip the comparison rather than
        # crash on an unbound local.
        prompt_batch_size = None

    if image_batch_size != 1 and prompt_batch_size is not None and image_batch_size != prompt_batch_size:
        raise ValueError(
            f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
        )
828
+
829
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
830
def prepare_control_image(
    self,
    image,
    width,
    height,
    batch_size,
    num_images_per_prompt,
    device,
    dtype,
    do_classifier_free_guidance=False,
    guess_mode=False,
):
    """Preprocess the control image and expand it to the effective batch.

    Args:
        image: Input handed to ``self.control_image_processor.preprocess``.
        width, height: Target size forwarded to the preprocessor.
        batch_size: Repeat count used when the preprocessed batch holds a
            single image.
        num_images_per_prompt: Repeat count used otherwise.
        device, dtype: Placement and dtype of the returned tensor.
        do_classifier_free_guidance: When true (and ``guess_mode`` is false)
            the batch is concatenated with itself.
        guess_mode: Disables the doubling above.

    Returns:
        The expanded control-image tensor on ``device`` with ``dtype``.
    """
    processed = self.control_image_processor.preprocess(image, height=height, width=width)
    processed = processed.to(dtype=torch.float32)

    # A lone image is shared by the whole batch; otherwise the image batch is
    # taken to match the prompt batch and only needs per-prompt repetition.
    repeats = batch_size if processed.shape[0] == 1 else num_images_per_prompt
    expanded = processed.repeat_interleave(repeats, dim=0).to(device=device, dtype=dtype)

    if do_classifier_free_guidance and not guess_mode:
        expanded = torch.cat([expanded, expanded])

    return expanded
859
+
860
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_latents
861
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
    image=None,
    timestep=None,
    is_strength_max=True,
    return_noise=False,
    return_image_latents=False,
):
    """Build the initial latents for the denoising loop.

    Args:
        batch_size: Effective batch size of the latents.
        num_channels_latents: Channel count of the latents.
        height, width: Pixel size; divided by ``self.vae_scale_factor`` to
            get the latent size.
        dtype, device: Dtype / device for generated tensors.
        generator: A generator or a list of ``batch_size`` generators.
        latents: Optional pre-made noise; when given it is moved to
            ``device`` and scaled by the scheduler's ``init_noise_sigma``.
        image: Image to encode; required when ``is_strength_max`` is false.
        timestep: Noise timestep; required when ``is_strength_max`` is false.
        is_strength_max: When true the latents start from pure noise,
            otherwise from the encoded image with noise added at ``timestep``.
        return_noise: Append the noise tensor to the result.
        return_image_latents: Encode ``image`` and append its latents.

    Returns:
        Tuple ``(latents[, noise][, image_latents])``.

    Raises:
        ValueError: Generator list length differs from ``batch_size``, or
            ``image`` / ``timestep`` is missing while ``is_strength_max`` is
            false.
    """
    latent_shape = (
        batch_size,
        num_channels_latents,
        height // self.vae_scale_factor,
        width // self.vae_scale_factor,
    )
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if not is_strength_max and (image is None or timestep is None):
        raise ValueError(
            "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
            "However, either the image or the noise timestep has not been provided."
        )

    # The encoded image is needed either because the caller asked for it or
    # because freshly generated latents have to start from it.
    if return_image_latents or (latents is None and not is_strength_max):
        image = image.to(device=device, dtype=dtype)
        image_latents = self._encode_vae_image(image=image, generator=generator)

    if latents is not None:
        noise = latents.to(device)
        latents = noise * self.scheduler.init_noise_sigma
    else:
        noise = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)
        if is_strength_max:
            # Pure noise: scale by the scheduler's initial sigma.
            latents = noise * self.scheduler.init_noise_sigma
        else:
            # Image + noise at the requested timestep.
            latents = self.scheduler.add_noise(image_latents, noise, timestep)

    result = [latents]
    if return_noise:
        result.append(noise)
    if return_image_latents:
        result.append(image_latents)
    return tuple(result)
913
+
914
+ def _default_height_width(self, height, width, image):
915
+ # NOTE: It is possible that a list of images have different
916
+ # dimensions for each image, so just checking the first image
917
+ # is not _exactly_ correct, but it is simple.
918
+ while isinstance(image, list):
919
+ image = image[0]
920
+
921
+ if height is None:
922
+ if isinstance(image, PIL.Image.Image):
923
+ height = image.height
924
+ elif isinstance(image, torch.Tensor):
925
+ height = image.shape[2]
926
+
927
+ height = (height // 8) * 8 # round down to nearest multiple of 8
928
+
929
+ if width is None:
930
+ if isinstance(image, PIL.Image.Image):
931
+ width = image.width
932
+ elif isinstance(image, torch.Tensor):
933
+ width = image.shape[3]
934
+
935
+ width = (width // 8) * 8 # round down to nearest multiple of 8
936
+
937
+ return height, width
938
+
939
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents
940
def prepare_mask_latents(
    self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
):
    """Resize the mask to latent resolution and encode the masked image.

    Args:
        mask: Mask tensor; interpolated to
            ``(height // vae_scale_factor, width // vae_scale_factor)``.
        masked_image: Image tensor passed to ``self._encode_vae_image``.
        batch_size: Target batch size; smaller mask / latent batches are
            tiled up to it and must divide it evenly.
        height, width: Pixel size of the generation.
        dtype, device: Dtype / device of the returned tensors.
        generator: Forwarded to ``self._encode_vae_image``.
        do_classifier_free_guidance: When true both outputs are concatenated
            with themselves.

    Returns:
        ``(mask, masked_image_latents)``.

    Raises:
        ValueError: ``batch_size`` is not a multiple of the mask batch or of
            the encoded-image batch.
    """
    # Resize before the dtype conversion to avoid breaking with cpu_offload
    # and half precision.
    latent_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
    mask = torch.nn.functional.interpolate(mask, size=latent_size)
    mask = mask.to(device=device, dtype=dtype)

    masked_image = masked_image.to(device=device, dtype=dtype)
    masked_image_latents = self._encode_vae_image(masked_image, generator=generator)

    # Tile mask and latents for each generation per prompt (mps-friendly).
    mask_count = mask.shape[0]
    if mask_count < batch_size:
        if batch_size % mask_count != 0:
            raise ValueError(
                "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                f" a total batch size of {batch_size}, but {mask_count} masks were passed. Make sure the number"
                " of masks that you pass is divisible by the total requested batch size."
            )
        mask = mask.repeat(batch_size // mask_count, 1, 1, 1)

    latent_count = masked_image_latents.shape[0]
    if latent_count < batch_size:
        if batch_size % latent_count != 0:
            raise ValueError(
                "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                f" to a total batch size of {batch_size}, but {latent_count} images were passed."
                " Make sure the number of images that you pass is divisible by the total requested batch size."
            )
        masked_image_latents = masked_image_latents.repeat(batch_size // latent_count, 1, 1, 1)

    if do_classifier_free_guidance:
        mask = torch.cat([mask, mask])
        masked_image_latents = torch.cat([masked_image_latents, masked_image_latents])

    # Align device/dtype to prevent errors when concatenating with the
    # latent model input.
    masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
    return mask, masked_image_latents
980
+
981
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline._encode_vae_image
982
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
983
+ if isinstance(generator, list):
984
+ image_latents = [
985
+ self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i])
986
+ for i in range(image.shape[0])
987
+ ]
988
+ image_latents = torch.cat(image_latents, dim=0)
989
+ else:
990
+ image_latents = self.vae.encode(image).latent_dist.sample(generator=generator)
991
+
992
+ image_latents = self.vae.config.scaling_factor * image_latents
993
+
994
+ return image_latents
995
+
996
def _check_inputs_woControl(
    self,
    prompt,
    height,
    width,
    strength,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
):
    """Validate the arguments of `predict_woControl` (no ControlNet inputs involved)."""
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    # `None` fails the isinstance test as well, so it needs no separate case.
    if not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )


@torch.no_grad()
def predict_woControl(
    self,
    promptA: Union[str, List[str]] = None,
    promptB: Union[str, List[str]] = None,
    image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    strength: float = 1.0,
    tradoff: float = 1.0,
    tradoff_nag: float = 1.0,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    negative_promptA: Optional[Union[str, List[str]]] = None,
    negative_promptB: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    task_class: Union[torch.Tensor, float, int] = None,
):
    r"""
    Run inpainting generation without any ControlNet conditioning.

    Args:
        promptA (`str` or `List[str]`, *optional*):
            Primary prompt. It is the prompt used for input validation and to derive the batch size. If not
            defined, you need to pass `prompt_embeds`.
        promptB (`str` or `List[str]`, *optional*):
            Second prompt, forwarded to `self._encode_prompt` together with `promptA` and `tradoff`.
        image (`PIL.Image.Image` or `torch.FloatTensor`):
            Image (batch) to be inpainted; handed to `prepare_mask_and_masked_image` together with `mask_image`.
        mask_image (`PIL.Image.Image` or `torch.FloatTensor`):
            Mask for `image`. In the 4-channel UNet path below, latents are regenerated where the mask is 1 and
            taken from the (re-noised) input image where it is 0.
        height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The height in pixels of the generated image. Must be divisible by 8.
        width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The width in pixels of the generated image. Must be divisible by 8.
        strength (`float`, *optional*, defaults to 1.0):
            Must be between 0 and 1. Controls how many of the `num_inference_steps` are actually run (see
            `get_timesteps`). When `strength` is 1 the latents start from pure noise; otherwise they start from
            the encoded `image` with noise added.
        tradoff (`float`, *optional*, defaults to 1.0):
            Forwarded to `self._encode_prompt` alongside `promptA` / `promptB`.
        tradoff_nag (`float`, *optional*, defaults to 1.0):
            Forwarded to `self._encode_prompt` alongside `negative_promptA` / `negative_promptB`.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps the scheduler is configured with. This parameter is modulated by
            `strength`.
        guidance_scale (`float`, *optional*, defaults to 7.5):
            Classifier-free guidance weight. Guidance is enabled when `guidance_scale > 1`.
        negative_promptA (`str` or `List[str]`, *optional*):
            Primary negative prompt. Mutually exclusive with `negative_prompt_embeds`.
        negative_promptB (`str` or `List[str]`, *optional*):
            Second negative prompt, forwarded to `self._encode_prompt`.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only passed
            to schedulers whose `step` accepts it.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
            generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents to be used as inputs for image generation. If not provided, a latents
            tensor is generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Required when `promptA` is not given.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Must have the same shape as `prompt_embeds` when both are
            passed.
        output_type (`str`, *optional*, defaults to `"pil"`):
            Output format handed to `self.image_processor.postprocess`. With `"latent"` the VAE decode and the
            safety checker are skipped.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
            plain tuple.
        callback (`Callable`, *optional*):
            Called as `callback(step: int, timestep: int, latents: torch.FloatTensor)` on progress updates whose
            step index is a multiple of `callback_steps`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function is called. Must be a positive integer.
        cross_attention_kwargs (`dict`, *optional*):
            Passed along to the UNet. Its `"scale"` entry, if present, is also used as the LoRA scale for prompt
            encoding.
        task_class (`torch.Tensor`, `float` or `int`, *optional*):
            When not `None`, passed to the UNet as the `task_class` keyword argument.

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
            otherwise a `tuple` is returned where the first element is the generated images and the second
            element is the safety checker's "not-safe-for-work" (nsfw) result (`None` when it did not run).

    Raises:
        ValueError: If the inputs are inconsistent, if `strength` leaves fewer than one inference step, or if
            the UNet does not have 4 or 9 input channels.
    """
    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor
    prompt = promptA
    negative_prompt = negative_promptA

    # 1. Check inputs. The class-level `check_inputs` expects ControlNet
    # arguments (`image` as second positional, guidance windows, ...), so this
    # ControlNet-free path uses its own validator.
    self._check_inputs_woControl(
        prompt,
        height,
        width,
        strength,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
    )
    prompt_embeds = self._encode_prompt(
        promptA,
        promptB,
        tradoff,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_promptA,
        negative_promptB,
        tradoff_nag,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 4. set timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps=num_inference_steps, strength=strength, device=device
    )
    # check that number of inference steps is not < 1 - as this doesn't make sense
    if num_inference_steps < 1:
        raise ValueError(
            f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
            f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
        )
    # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
    # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
    is_strength_max = strength == 1.0

    # 5. Preprocess mask and image
    mask, masked_image, init_image = prepare_mask_and_masked_image(
        image, mask_image, height, width, return_image=True
    )
    mask_condition = mask.clone()

    # 6. Prepare latent variables
    num_channels_latents = self.vae.config.latent_channels
    num_channels_unet = self.unet.config.in_channels
    # A 4-channel UNet gets no mask input, so the image latents are needed to
    # re-impose the unmasked region after every step (see the loop below).
    return_image_latents = num_channels_unet == 4

    latents_outputs = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
        image=init_image,
        timestep=latent_timestep,
        is_strength_max=is_strength_max,
        return_noise=True,
        return_image_latents=return_image_latents,
    )

    if return_image_latents:
        latents, noise, image_latents = latents_outputs
    else:
        latents, noise = latents_outputs

    # 7. Prepare mask latent variables
    mask, masked_image_latents = self.prepare_mask_latents(
        mask,
        masked_image,
        batch_size * num_images_per_prompt,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        do_classifier_free_guidance,
    )

    # 8. Check that sizes of mask, masked image and latents match
    if num_channels_unet == 9:
        # default case for runwayml/stable-diffusion-inpainting
        num_channels_mask = mask.shape[1]
        num_channels_masked_image = masked_image_latents.shape[1]
        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
            raise ValueError(
                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
                " `pipeline.unet` or your `mask_image` or `image` input."
            )
    elif num_channels_unet != 4:
        raise ValueError(
            f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
        )

    # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 10. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

            # concat latents, mask, masked_image_latents in the channel dimension
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            if num_channels_unet == 9:
                latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)

            # predict the noise residual
            if task_class is not None:
                noise_pred = self.unet(
                    sample=latent_model_input,
                    timestep=t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                    task_class=task_class,
                )[0]
            else:
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            if num_channels_unet == 4:
                # Re-impose the known (unmasked) content: noise the original
                # image latents to the next timestep and blend by the mask.
                init_latents_proper = image_latents[:1]
                init_mask = mask[:1]

                if i < len(timesteps) - 1:
                    noise_timestep = timesteps[i + 1]
                    init_latents_proper = self.scheduler.add_noise(
                        init_latents_proper, noise, torch.tensor([noise_timestep])
                    )

                latents = (1 - init_mask) * init_latents_proper + init_mask * latents

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    if not output_type == "latent":
        condition_kwargs = {}
        if isinstance(self.vae, AsymmetricAutoencoderKL):
            # This VAE type additionally receives the original image and mask
            # as decode-time conditions.
            init_image = init_image.to(device=device, dtype=masked_image_latents.dtype)
            init_image_condition = init_image.clone()
            init_image = self._encode_vae_image(init_image, generator=generator)
            mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
            condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, **condition_kwargs)[0]
        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
    else:
        image = latents
        has_nsfw_concept = None

    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

    # Offload last model to CPU
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.final_offload_hook.offload()

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
1348
+
1349
+
1350
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    promptA: Union[str, List[str]] = None,
    promptB: Union[str, List[str]] = None,
    image: Union[torch.Tensor, PIL.Image.Image] = None,
    mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
    control_image: Union[
        torch.FloatTensor,
        PIL.Image.Image,
        np.ndarray,
        List[torch.FloatTensor],
        List[PIL.Image.Image],
        List[np.ndarray],
    ] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    strength: float = 1.0,
    tradoff: float = 1.0,
    tradoff_nag: float = 1.0,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    negative_promptA: Optional[Union[str, List[str]]] = None,
    negative_promptB: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    controlnet_conditioning_scale: Union[float, List[float]] = 0.5,
    guess_mode: bool = False,
    control_guidance_start: Union[float, List[float]] = 0.0,
    control_guidance_end: Union[float, List[float]] = 1.0,
):
    r"""
    Function invoked when calling the pipeline for ControlNet-guided inpainting.

    Args:
        promptA (`str` or `List[str]`, *optional*):
            First prompt forwarded to `_encode_prompt`. It is also the prompt used for input validation and
            for inferring the batch size. If not defined, one has to pass `prompt_embeds` instead.
        promptB (`str` or `List[str]`, *optional*):
            Second prompt forwarded to `_encode_prompt` together with `promptA`.
        image (`torch.Tensor` or `PIL.Image.Image`):
            The image to inpaint. It is preprocessed together with `mask_image` and, when `height`/`width`
            are not given, is used to derive the default output size.
        mask_image (`torch.Tensor` or `PIL.Image.Image`):
            The inpainting mask for `image`. NOTE(review): the polarity of the mask is decided by
            `prepare_mask_and_masked_image`, which is not visible here -- confirm before relying on it.
        control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`,
                `List[PIL.Image.Image]` or `List[np.ndarray]`):
            The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet.
            If multiple ControlNets are specified in init, one condition per ControlNet must be passed as a
            list; each element is prepared separately for its controlnet.
        height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The width in pixels of the generated image.
        strength (`float`, *optional*, defaults to 1.):
            Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
            between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
            `strength`. The number of denoising steps depends on the amount of noise initially added. When
            `strength` is 1, added noise will be maximum and the denoising process will run for the full number of
            iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked
            portion of the reference `image`.
        tradoff (`float`, *optional*, defaults to 1.0):
            Forwarded to `_encode_prompt` alongside `promptA`/`promptB`. Presumably the blending weight between
            the two prompts -- confirm against `_encode_prompt`.
        tradoff_nag (`float`, *optional*, defaults to 1.0):
            Forwarded to `_encode_prompt` alongside `negative_promptA`/`negative_promptB`. Presumably the
            blending weight between the two negative prompts -- confirm against `_encode_prompt`.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        guidance_scale (`float`, *optional*, defaults to 7.5):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate images that are closely linked to the text prompt,
            usually at the expense of lower image quality.
        negative_promptA (`str` or `List[str]`, *optional*):
            First negative prompt forwarded to `_encode_prompt`; also the negative prompt used for input
            validation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not
            using guidance (i.e., ignored if `guidance_scale` is less than `1`).
        negative_promptB (`str` or `List[str]`, *optional*):
            Second negative prompt forwarded to `_encode_prompt` together with `negative_promptA`.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from the prompt input arguments.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from the negative prompt input
            arguments.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. When set to
            `"latent"`, the VAE decode and the safety checker are skipped and the latents are post-processed
            directly.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
            plain tuple.
        callback (`Callable`, *optional*):
            A function that will be called every `callback_steps` steps during inference. The function will be
            called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function will be called. If not specified, the callback will be
            called at every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
        controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.5):
            The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
            to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
            corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting
            than for [`~StableDiffusionControlNetPipeline.__call__`].
        guess_mode (`bool`, *optional*, defaults to `False`):
            In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
            you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
        control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
            The percentage of total steps at which the controlnet starts applying.
        control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
            The percentage of total steps at which the controlnet stops applying.

    Examples:

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
        When returning a tuple, the first element is a list with the generated images, and the second element is a
        list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
        (nsfw) content, according to the `safety_checker`.

    Raises:
        TypeError: If `self.controlnet` is neither a `ControlNetModel` nor a `MultiControlNetModel`.
    """
    # Unwrap a torch.compile()-d controlnet so the isinstance checks below see the real module.
    controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

    # 0. Default height and width to unet
    height, width = self._default_height_width(height, width, image)

    # The "A" prompts act as the reference prompts for validation and batch-size inference.
    prompt = promptA
    negative_prompt = negative_promptA

    # Align format for control guidance: both bounds become lists with one entry per controlnet.
    if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
        control_guidance_start = len(control_guidance_end) * [control_guidance_start]
    elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
        control_guidance_end = len(control_guidance_start) * [control_guidance_end]
    elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
        mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
        control_guidance_start = mult * [control_guidance_start]
        control_guidance_end = mult * [control_guidance_end]

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        control_image,
        height,
        width,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
        controlnet_conditioning_scale,
        control_guidance_start,
        control_guidance_end,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
        controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)

    global_pool_conditions = (
        controlnet.config.global_pool_conditions
        if isinstance(controlnet, ControlNetModel)
        else controlnet.nets[0].config.global_pool_conditions
    )
    guess_mode = guess_mode or global_pool_conditions

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
    )
    prompt_embeds = self._encode_prompt(
        promptA,
        promptB,
        tradoff,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_promptA,
        negative_promptB,
        tradoff_nag,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 4. Prepare the ControlNet condition image(s)
    if isinstance(controlnet, ControlNetModel):
        control_image = self.prepare_control_image(
            image=control_image,
            width=width,
            height=height,
            batch_size=batch_size * num_images_per_prompt,
            num_images_per_prompt=num_images_per_prompt,
            device=device,
            dtype=controlnet.dtype,
            do_classifier_free_guidance=do_classifier_free_guidance,
            guess_mode=guess_mode,
        )
    elif isinstance(controlnet, MultiControlNetModel):
        control_images = []

        for control_image_ in control_image:
            control_image_ = self.prepare_control_image(
                image=control_image_,
                width=width,
                height=height,
                batch_size=batch_size * num_images_per_prompt,
                num_images_per_prompt=num_images_per_prompt,
                device=device,
                dtype=controlnet.dtype,
                do_classifier_free_guidance=do_classifier_free_guidance,
                guess_mode=guess_mode,
            )

            control_images.append(control_image_)

        control_image = control_images
    else:
        # An explicit raise (rather than `assert False`) so the guard survives `python -O`.
        raise TypeError(f"Unsupported controlnet type: {type(controlnet).__name__}")

    # 5. Preprocess mask and image - resizes image and mask w.r.t height and width
    mask, masked_image, init_image = prepare_mask_and_masked_image(
        image, mask_image, height, width, return_image=True
    )

    # 6. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps=num_inference_steps, strength=strength, device=device
    )
    # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
    # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
    is_strength_max = strength == 1.0

    # 7. Prepare latent variables
    num_channels_latents = self.vae.config.latent_channels
    num_channels_unet = self.unet.config.in_channels
    # A 4-channel unet has no mask inputs, so the encoded image is needed to re-impose the
    # unmasked region after every denoising step (see the blend inside the loop).
    return_image_latents = num_channels_unet == 4
    latents_outputs = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
        image=init_image,
        timestep=latent_timestep,
        is_strength_max=is_strength_max,
        return_noise=True,
        return_image_latents=return_image_latents,
    )

    if return_image_latents:
        latents, noise, image_latents = latents_outputs
    else:
        latents, noise = latents_outputs

    # 8. Prepare mask latent variables
    mask, masked_image_latents = self.prepare_mask_latents(
        mask,
        masked_image,
        batch_size * num_images_per_prompt,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        do_classifier_free_guidance,
    )

    # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 9.1 Create tensor stating which controlnets to keep:
    # 1.0 while the step lies inside [start, end] of a controlnet's guidance window, else 0.0.
    controlnet_keep = []
    for i in range(len(timesteps)):
        keeps = [
            1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
            for s, e in zip(control_guidance_start, control_guidance_end)
        ]
        controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)

    # 10. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # controlnet(s) inference
            if guess_mode and do_classifier_free_guidance:
                # Infer ControlNet only for the conditional batch.
                control_model_input = latents
                control_model_input = self.scheduler.scale_model_input(control_model_input, t)
                controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
            else:
                control_model_input = latent_model_input
                controlnet_prompt_embeds = prompt_embeds

            if isinstance(controlnet_keep[i], list):
                cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
            else:
                controlnet_cond_scale = controlnet_conditioning_scale
                if isinstance(controlnet_cond_scale, list):
                    controlnet_cond_scale = controlnet_cond_scale[0]
                cond_scale = controlnet_cond_scale * controlnet_keep[i]

            down_block_res_samples, mid_block_res_sample = self.controlnet(
                control_model_input,
                t,
                encoder_hidden_states=controlnet_prompt_embeds,
                controlnet_cond=control_image,
                conditioning_scale=cond_scale,
                guess_mode=guess_mode,
                return_dict=False,
            )

            if guess_mode and do_classifier_free_guidance:
                # Inferred ControlNet only for the conditional batch.
                # To apply the output of ControlNet to both the unconditional and conditional batches,
                # add 0 to the unconditional batch to keep it unchanged.
                down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
                mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])

            # predict the noise residual
            if num_channels_unet == 9:
                latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)

            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
                down_block_additional_residuals=down_block_res_samples,
                mid_block_additional_residual=mid_block_res_sample,
                return_dict=False,
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            if num_channels_unet == 4:
                # Re-impose the original image outside the mask, noised to the next timestep.
                init_latents_proper = image_latents[:1]
                init_mask = mask[:1]

                if i < len(timesteps) - 1:
                    noise_timestep = timesteps[i + 1]
                    init_latents_proper = self.scheduler.add_noise(
                        init_latents_proper, noise, torch.tensor([noise_timestep])
                    )

                latents = (1 - init_mask) * init_latents_proper + init_mask * latents

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    # If we do sequential model offloading, let's offload unet and controlnet
    # manually for max memory savings
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.unet.to("cpu")
        self.controlnet.to("cpu")
        torch.cuda.empty_cache()

    if output_type != "latent":
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
    else:
        image = latents
        has_nsfw_concept = None

    # Images flagged by the safety checker are not denormalized.
    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

    # Offload last model to CPU
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.final_offload_hook.offload()

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
iopaint/model/power_paint/power_paint.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import PIL.Image
3
+ import cv2
4
+ import torch
5
+ from loguru import logger
6
+
7
+ from ..base import DiffusionInpaintModel
8
+ from ..helper.cpu_text_encoder import CPUTextEncoderWrapper
9
+ from ..utils import (
10
+ handle_from_pretrained_exceptions,
11
+ get_torch_dtype,
12
+ enable_low_mem,
13
+ is_local_files_only,
14
+ )
15
+ from iopaint.schema import InpaintRequest
16
+ from .powerpaint_tokenizer import add_task_to_prompt
17
+ from ...const import POWERPAINT_NAME
18
+
19
+
20
class PowerPaint(DiffusionInpaintModel):
    """Inpainting model that drives the PowerPaint Stable Diffusion pipeline.

    The pipeline is loaded in `init_model` with its tokenizer wrapped in
    `PowerPaintTokenizer`; `forward` turns the request's prompt and task into
    the A/B prompt pairs the pipeline expects.
    """

    name = POWERPAINT_NAME
    pad_mod = 8
    min_size = 512
    lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"

    def init_model(self, device: torch.device, **kwargs):
        """Load the PowerPaint pipeline and place it on ``device``.

        Recognised ``kwargs`` (all optional): ``no_half``, ``disable_nsfw``,
        ``cpu_offload``, ``low_mem``, ``sd_cpu_textencoder`` and ``callback``.
        Remaining entries are forwarded to `is_local_files_only`.
        """
        from .pipeline_powerpaint import StableDiffusionInpaintPipeline
        from .powerpaint_tokenizer import PowerPaintTokenizer

        use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False))
        model_kwargs = {"local_files_only": is_local_files_only(**kwargs)}
        # Optional flags are read with .get() so an omitted flag means "off" instead of
        # raising KeyError, matching how the other flags in this method are read.
        # The safety checker is also dropped whenever cpu offload is requested.
        if kwargs.get("disable_nsfw", False) or kwargs.get("cpu_offload", False):
            logger.info("Disable Stable Diffusion Model NSFW checker")
            model_kwargs.update(
                dict(
                    safety_checker=None,
                    feature_extractor=None,
                    requires_safety_checker=False,
                )
            )

        self.model = handle_from_pretrained_exceptions(
            StableDiffusionInpaintPipeline.from_pretrained,
            pretrained_model_name_or_path=self.name,
            variant="fp16",
            torch_dtype=torch_dtype,
            **model_kwargs,
        )
        self.model.tokenizer = PowerPaintTokenizer(self.model.tokenizer)

        enable_low_mem(self.model, kwargs.get("low_mem", False))

        if kwargs.get("cpu_offload", False) and use_gpu:
            logger.info("Enable sequential cpu offload")
            self.model.enable_sequential_cpu_offload(gpu_id=0)
        else:
            self.model = self.model.to(device)
            if kwargs.get("sd_cpu_textencoder", False):
                logger.info("Run Stable Diffusion TextEncoder on CPU")
                self.model.text_encoder = CPUTextEncoderWrapper(
                    self.model.text_encoder, torch_dtype
                )

        self.callback = kwargs.pop("callback", None)

    def forward(self, image, mask, config: InpaintRequest):
        """Input image and output image have same size
        image: [H, W, C] RGB
        mask: [H, W, 1] 255 means area to repaint
        return: BGR IMAGE
        """
        self.set_scheduler(config)

        img_h, img_w = image.shape[:2]
        # Expand the user prompt and task into the A/B prompt pairs used by the pipeline.
        promptA, promptB, negative_promptA, negative_promptB = add_task_to_prompt(
            config.prompt, config.negative_prompt, config.powerpaint_task
        )

        output = self.model(
            image=PIL.Image.fromarray(image),
            promptA=promptA,
            promptB=promptB,
            # The same fitting degree is used for the positive and the negative pair.
            tradoff=config.fitting_degree,
            tradoff_nag=config.fitting_degree,
            negative_promptA=negative_promptA,
            negative_promptB=negative_promptB,
            mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"),
            num_inference_steps=config.sd_steps,
            strength=config.sd_strength,
            guidance_scale=config.sd_guidance_scale,
            output_type="np",
            callback=self.callback,
            height=img_h,
            width=img_w,
            generator=torch.manual_seed(config.sd_seed),
            callback_steps=1,
        ).images[0]

        # Scale the pipeline's "np" output by 255 to uint8, then swap RGB -> BGR.
        output = (output * 255).round().astype("uint8")
        output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
        return output
iopaint/model/sd.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PIL.Image
2
+ import cv2
3
+ import torch
4
+ from loguru import logger
5
+
6
+ from .base import DiffusionInpaintModel
7
+ from .helper.cpu_text_encoder import CPUTextEncoderWrapper
8
+ from .original_sd_configs import get_config_files
9
+ from .utils import (
10
+ handle_from_pretrained_exceptions,
11
+ get_torch_dtype,
12
+ enable_low_mem,
13
+ is_local_files_only,
14
+ )
15
+ from iopaint.schema import InpaintRequest, ModelType
16
+
17
+
18
class SD(DiffusionInpaintModel):
    """Base class for Stable Diffusion inpainting models backed by diffusers.

    Subclasses supply ``name`` and ``model_id_or_path``; the pipeline is loaded
    either from a single checkpoint file or via ``from_pretrained``.
    """

    pad_mod = 8
    min_size = 512
    lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"

    def init_model(self, device: torch.device, **kwargs):
        """Load the inpaint pipeline and place it on ``device``.

        Recognised ``kwargs`` (all optional): ``no_half``, ``pipe_components``,
        ``disable_nsfw``, ``cpu_offload``, ``low_mem``, ``sd_cpu_textencoder``
        and ``callback``. Remaining entries are forwarded to
        `is_local_files_only`.
        """
        from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline

        use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False))

        model_kwargs = {
            **kwargs.get("pipe_components", {}),
            "local_files_only": is_local_files_only(**kwargs),
        }
        # Optional flags are read with .get() so an omitted flag means "off" instead of
        # raising KeyError. The safety checker is also dropped when cpu offload is requested.
        disable_nsfw_checker = kwargs.get("disable_nsfw", False) or kwargs.get(
            "cpu_offload", False
        )
        if disable_nsfw_checker:
            logger.info("Disable Stable Diffusion Model NSFW checker")
            model_kwargs.update(
                dict(
                    safety_checker=None,
                    feature_extractor=None,
                    requires_safety_checker=False,
                )
            )

        if self.model_info.is_single_file_diffusers:
            # Plain SD checkpoints use a 4-channel unet input; every other model type
            # handled here is loaded with 9 input channels.
            if self.model_info.model_type == ModelType.DIFFUSERS_SD:
                model_kwargs["num_in_channels"] = 4
            else:
                model_kwargs["num_in_channels"] = 9

            self.model = StableDiffusionInpaintPipeline.from_single_file(
                self.model_id_or_path,
                torch_dtype=torch_dtype,
                load_safety_checker=not disable_nsfw_checker,
                config_files=get_config_files(),
                **model_kwargs,
            )
        else:
            self.model = handle_from_pretrained_exceptions(
                StableDiffusionInpaintPipeline.from_pretrained,
                pretrained_model_name_or_path=self.model_id_or_path,
                variant="fp16",
                torch_dtype=torch_dtype,
                **model_kwargs,
            )

        enable_low_mem(self.model, kwargs.get("low_mem", False))

        if kwargs.get("cpu_offload", False) and use_gpu:
            logger.info("Enable sequential cpu offload")
            self.model.enable_sequential_cpu_offload(gpu_id=0)
        else:
            self.model = self.model.to(device)
            if kwargs.get("sd_cpu_textencoder", False):
                logger.info("Run Stable Diffusion TextEncoder on CPU")
                self.model.text_encoder = CPUTextEncoderWrapper(
                    self.model.text_encoder, torch_dtype
                )

        self.callback = kwargs.pop("callback", None)

    def forward(self, image, mask, config: InpaintRequest):
        """Input image and output image have same size
        image: [H, W, C] RGB
        mask: [H, W, 1] 255 means area to repaint
        return: BGR IMAGE
        """
        self.set_scheduler(config)

        img_h, img_w = image.shape[:2]

        output = self.model(
            image=PIL.Image.fromarray(image),
            prompt=config.prompt,
            negative_prompt=config.negative_prompt,
            mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"),
            num_inference_steps=config.sd_steps,
            strength=config.sd_strength,
            guidance_scale=config.sd_guidance_scale,
            output_type="np",
            callback_on_step_end=self.callback,
            height=img_h,
            width=img_w,
            generator=torch.manual_seed(config.sd_seed),
        ).images[0]

        # Scale the pipeline's "np" output by 255 to uint8, then swap RGB -> BGR.
        output = (output * 255).round().astype("uint8")
        output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
        return output
110
+
111
+
112
class SD15(SD):
    """`SD` preset whose name and model id are both the runwayml inpainting repo."""

    model_id_or_path = "runwayml/stable-diffusion-inpainting"
    name = model_id_or_path
115
+
116
+
117
class Anything4(SD):
    """`SD` preset whose name and model id are both the anything-4.0 inpainting repo."""

    model_id_or_path = "Sanster/anything-4.0-inpainting"
    name = model_id_or_path
120
+
121
+
122
class RealisticVision14(SD):
    """`SD` preset whose name and model id are both the Realistic Vision V1.4 inpainting repo."""

    model_id_or_path = "Sanster/Realistic_Vision_V1.4-inpainting"
    name = model_id_or_path
125
+
126
+
127
class SD2(SD):
    """`SD` preset whose name and model id are both the stabilityai SD2 inpainting repo."""

    model_id_or_path = "stabilityai/stable-diffusion-2-inpainting"
    name = model_id_or_path
iopaint/model/sdxl.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import PIL.Image
4
+ import cv2
5
+ import torch
6
+ from diffusers import AutoencoderKL
7
+ from loguru import logger
8
+
9
+ from iopaint.schema import InpaintRequest, ModelType
10
+
11
+ from .base import DiffusionInpaintModel
12
+ from .helper.cpu_text_encoder import CPUTextEncoderWrapper
13
+ from .original_sd_configs import get_config_files
14
+ from .utils import (
15
+ handle_from_pretrained_exceptions,
16
+ get_torch_dtype,
17
+ enable_low_mem,
18
+ is_local_files_only,
19
+ )
20
+
21
+
22
class SDXL(DiffusionInpaintModel):
    """Stable Diffusion XL inpainting model backed by diffusers.

    The pipeline is loaded from a single checkpoint file when
    ``model_id_or_path`` is an existing file, otherwise via ``from_pretrained``.
    """

    name = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
    pad_mod = 8
    min_size = 512
    lcm_lora_id = "latent-consistency/lcm-lora-sdxl"
    model_id_or_path = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"

    def init_model(self, device: torch.device, **kwargs):
        """Load the SDXL inpaint pipeline and place it on ``device``.

        Recognised ``kwargs`` (all optional): ``no_half``, ``pipe_components``,
        ``cpu_offload``, ``low_mem``, ``sd_cpu_textencoder`` and ``callback``.
        Remaining entries are forwarded to `is_local_files_only`.
        """
        from diffusers.pipelines import StableDiffusionXLInpaintPipeline

        use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False))

        # Plain SDXL checkpoints use a 4-channel unet input; other model types use 9.
        if self.model_info.model_type == ModelType.DIFFUSERS_SDXL:
            num_in_channels = 4
        else:
            num_in_channels = 9

        if os.path.isfile(self.model_id_or_path):
            self.model = StableDiffusionXLInpaintPipeline.from_single_file(
                self.model_id_or_path,
                torch_dtype=torch_dtype,
                num_in_channels=num_in_channels,
                load_safety_checker=False,
                config_files=get_config_files(),
            )
        else:
            model_kwargs = {
                **kwargs.get("pipe_components", {}),
                "local_files_only": is_local_files_only(**kwargs),
            }
            # Fall back to the fp16-fix VAE unless the caller supplied its own component.
            if "vae" not in model_kwargs:
                vae = AutoencoderKL.from_pretrained(
                    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch_dtype
                )
                model_kwargs["vae"] = vae
            self.model = handle_from_pretrained_exceptions(
                StableDiffusionXLInpaintPipeline.from_pretrained,
                pretrained_model_name_or_path=self.model_id_or_path,
                torch_dtype=torch_dtype,
                variant="fp16",
                **model_kwargs,
            )

        enable_low_mem(self.model, kwargs.get("low_mem", False))

        if kwargs.get("cpu_offload", False) and use_gpu:
            logger.info("Enable sequential cpu offload")
            self.model.enable_sequential_cpu_offload(gpu_id=0)
        else:
            self.model = self.model.to(device)
            # Read with .get() so an omitted flag means "off" instead of raising KeyError,
            # matching how the other optional flags in this method are read.
            if kwargs.get("sd_cpu_textencoder", False):
                logger.info("Run Stable Diffusion TextEncoder on CPU")
                self.model.text_encoder = CPUTextEncoderWrapper(
                    self.model.text_encoder, torch_dtype
                )
                self.model.text_encoder_2 = CPUTextEncoderWrapper(
                    self.model.text_encoder_2, torch_dtype
                )

        self.callback = kwargs.pop("callback", None)

    def forward(self, image, mask, config: InpaintRequest):
        """Input image and output image have same size
        image: [H, W, C] RGB
        mask: [H, W, 1] 255 means area to repaint
        return: BGR IMAGE
        """
        self.set_scheduler(config)

        img_h, img_w = image.shape[:2]

        output = self.model(
            image=PIL.Image.fromarray(image),
            prompt=config.prompt,
            negative_prompt=config.negative_prompt,
            mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"),
            num_inference_steps=config.sd_steps,
            # A strength of exactly 1.0 is replaced by 0.999.
            # NOTE(review): presumably works around pipeline behaviour at strength == 1.0 -- confirm.
            strength=0.999 if config.sd_strength == 1.0 else config.sd_strength,
            guidance_scale=config.sd_guidance_scale,
            output_type="np",
            callback_on_step_end=self.callback,
            height=img_h,
            width=img_w,
            generator=torch.manual_seed(config.sd_seed),
        ).images[0]

        # Scale the pipeline's "np" output by 255 to uint8, then swap RGB -> BGR.
        output = (output * 255).round().astype("uint8")
        output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
        return output
iopaint/plugins/realesrgan.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ from loguru import logger
5
+
6
+ from iopaint.helper import download_model
7
+ from iopaint.plugins.base_plugin import BasePlugin
8
+ from iopaint.schema import RunPluginRequest, RealESRGANModel
9
+
10
+
11
class RealESRGANUpscaler(BasePlugin):
    """Plugin that upscales images with one of the supported RealESRGAN models."""

    name = "RealESRGAN"
    support_gen_image = True

    def __init__(self, name, device, no_half=False):
        """Remember the configuration and build the requested model.

        Args:
            name: Key identifying which RealESRGAN model to load.
            device: Device handed to the underlying ``RealESRGANer``.
            no_half: When true, half precision is never enabled.
        """
        super().__init__()
        self.model_name = name
        self.device = device
        self.no_half = no_half
        self._init_model(name)

    def _init_model(self, name):
        """Download (if needed) and instantiate the model identified by ``name``.

        Raises:
            ValueError: If ``name`` is not one of the known models.
        """
        # Imported lazily so the module can be imported without these packages.
        from basicsr.archs.rrdbnet_arch import RRDBNet
        from realesrgan import RealESRGANer
        from realesrgan.archs.srvgg_arch import SRVGGNetCompact

        def build_general_x4v3():
            return SRVGGNetCompact(
                num_in_ch=3,
                num_out_ch=3,
                num_feat=64,
                num_conv=32,
                upscale=4,
                act_type="prelu",
            )

        def build_x4plus():
            return RRDBNet(
                num_in_ch=3,
                num_out_ch=3,
                num_feat=64,
                num_block=23,
                num_grow_ch=32,
                scale=4,
            )

        def build_x4plus_anime_6b():
            return RRDBNet(
                num_in_ch=3,
                num_out_ch=3,
                num_feat=64,
                num_block=6,
                num_grow_ch=32,
                scale=4,
            )

        # Per-model download URL, checksum, scale and a factory that builds the
        # network only when that model is actually selected.
        known_models = {
            RealESRGANModel.realesr_general_x4v3: {
                "url": "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth",
                "scale": 4,
                "model": build_general_x4v3,
                "model_md5": "91a7644643c884ee00737db24e478156",
            },
            RealESRGANModel.RealESRGAN_x4plus: {
                "url": "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth",
                "scale": 4,
                "model": build_x4plus,
                "model_md5": "99ec365d4afad750833258a1a24f44ca",
            },
            RealESRGANModel.RealESRGAN_x4plus_anime_6B: {
                "url": "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth",
                "scale": 4,
                "model": build_x4plus_anime_6b,
                "model_md5": "d58ce384064ec1591c2ea7b79dbf47ba",
            },
        }

        spec = known_models.get(name)
        if spec is None:
            raise ValueError(f"Unknown RealESRGAN model name: {name}")

        model_path = download_model(spec["url"], spec["model_md5"])
        logger.info(f"RealESRGAN model path: {model_path}")

        # Half precision only when the device string mentions cuda and the
        # caller did not opt out.
        use_half = "cuda" in str(self.device) and not self.no_half

        self.model = RealESRGANer(
            scale=spec["scale"],
            model_path=model_path,
            model=spec["model"](),
            half=use_half,
            tile=512,
            tile_pad=10,
            pre_pad=10,
            device=self.device,
        )

    def switch_model(self, new_model_name: str):
        """Load ``new_model_name`` unless it is already the active model."""
        if self.model_name != new_model_name:
            self._init_model(new_model_name)
            # Only record the new name once the model was built successfully.
            self.model_name = new_model_name

    def gen_image(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray:
        """Upscale an RGB image by ``req.scale`` and return the enhanced result."""
        bgr_np_img = cv2.cvtColor(rgb_np_img, cv2.COLOR_RGB2BGR)
        logger.info(f"RealESRGAN input shape: {bgr_np_img.shape}, scale: {req.scale}")
        upscaled = self.forward(bgr_np_img, req.scale)
        logger.info(f"RealESRGAN output shape: {upscaled.shape}")
        return upscaled

    @torch.inference_mode()
    def forward(self, bgr_np_img, scale: float):
        """Run the enhancer on a BGR image and return its first output."""
        # The output is BGR.
        enhanced = self.model.enhance(bgr_np_img, outscale=scale)
        return enhanced[0]

    def check_dep(self):
        """Return an install hint if ``realesrgan`` cannot be imported, else None."""
        try:
            import realesrgan
        except ImportError:
            return "RealESRGAN is not installed, please install it first. pip install realesrgan"
        return None
iopaint/plugins/remove_bg.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ from loguru import logger
5
+ from torch.hub import get_dir
6
+
7
+ from iopaint.plugins.base_plugin import BasePlugin
8
+ from iopaint.schema import RunPluginRequest, RemoveBGModel
9
+
10
+
11
class RemoveBG(BasePlugin):
    """Plugin that removes image backgrounds or produces foreground masks."""

    name = "RemoveBG"
    support_gen_mask = True
    support_gen_image = True

    def __init__(self, model_name):
        """Point U2NET_HOME at the torch hub checkpoints dir and open a session."""
        super().__init__()
        self.model_name = model_name

        # Store downloaded weights under "<torch hub dir>/checkpoints".
        os.environ["U2NET_HOME"] = os.path.join(get_dir(), "checkpoints")

        self._init_session(model_name)

    def _init_session(self, model_name: str):
        """Create the session and removal function for ``model_name``."""
        if model_name == RemoveBGModel.briaai_rmbg_1_4:
            # This model is served by the project's own briarmbg module.
            from iopaint.plugins.briarmbg import (
                create_briarmbg_session,
                briarmbg_process,
            )

            self.session = create_briarmbg_session()
            self.remove = briarmbg_process
            return

        # Every other model name is passed to rembg.
        from rembg import new_session, remove

        self.session = new_session(model_name=model_name)
        self.remove = remove

    def switch_model(self, new_model_name):
        """Replace the active session unless ``new_model_name`` is already in use."""
        if self.model_name != new_model_name:
            logger.info(
                f"Switching removebg model from {self.model_name} to {new_model_name}"
            )
            self._init_session(new_model_name)
            self.model_name = new_model_name

    def gen_image(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray:
        """Remove the background of an RGB image and return an RGBA image."""
        bgr_input = cv2.cvtColor(rgb_np_img, cv2.COLOR_RGB2BGR)

        # The removal function returns a BGRA image; reorder it to RGBA.
        bgra_output = self.remove(bgr_input, session=self.session)
        return cv2.cvtColor(bgra_output, cv2.COLOR_BGRA2RGBA)

    def gen_mask(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray:
        """Return the mask produced by the removal function for an RGB image."""
        bgr_input = cv2.cvtColor(rgb_np_img, cv2.COLOR_RGB2BGR)

        # In the returned mask 255 means foreground and 0 means background.
        return self.remove(bgr_input, session=self.session, only_mask=True)

    def check_dep(self):
        """Return an install hint if ``rembg`` cannot be imported, else None."""
        try:
            import rembg
        except ImportError:
            return (
                "RemoveBG is not installed, please install it first. pip install rembg"
            )
        return None
iopaint/plugins/restoreformer.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ from loguru import logger
4
+
5
+ from iopaint.helper import download_model
6
+ from iopaint.plugins.base_plugin import BasePlugin
7
+ from iopaint.schema import RunPluginRequest
8
+
9
+
10
class RestoreFormerPlugin(BasePlugin):
    """Plugin that enhances faces using the RestoreFormer weights."""

    name = "RestoreFormer"
    support_gen_image = True

    def __init__(self, device, upscaler=None):
        """Download the RestoreFormer weights and build the face enhancer.

        Args:
            device: Device passed to the enhancer.
            upscaler: Optional object whose ``model`` attribute is used as the
                background upsampler; no background upsampler when None.
        """
        super().__init__()
        from .gfpganer import MyGFPGANer

        model_path = download_model(
            "https://github.com/TencentARC/GFPGAN/releases/download/v1.3.4/RestoreFormer.pth",
            "eaeeff6c4a1caa1673977cb374e6f699",
        )
        logger.info(f"RestoreFormer model path: {model_path}")

        import facexlib

        # Override the module-level device of facexlib's retinaface detector,
        # but only when that attribute exists.
        retinaface = facexlib.detection.retinaface
        if hasattr(retinaface, "device"):
            retinaface.device = device

        if upscaler is None:
            bg_upsampler = None
        else:
            bg_upsampler = upscaler.model

        self.face_enhancer = MyGFPGANer(
            model_path=model_path,
            upscale=1,
            arch="RestoreFormer",
            channel_multiplier=2,
            device=device,
            bg_upsampler=bg_upsampler,
        )

    def gen_image(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray:
        """Enhance the faces in an RGB image and return the BGR result."""
        bgr_input = cv2.cvtColor(rgb_np_img, cv2.COLOR_RGB2BGR)
        logger.info(f"RestoreFormer input shape: {bgr_input.shape}")

        # The enhancer returns three values; only the last (the pasted-back
        # image) is used. The weight is fixed at 0.5.
        enhance_result = self.face_enhancer.enhance(
            bgr_input,
            has_aligned=False,
            only_center_face=False,
            paste_back=True,
            weight=0.5,
        )
        _, _, bgr_output = enhance_result
        logger.info(f"RestoreFormer output shape: {bgr_output.shape}")
        return bgr_output

    def check_dep(self):
        """Return an install hint if ``gfpgan`` cannot be imported, else None."""
        try:
            import gfpgan
        except ImportError:
            return (
                "gfpgan is not installed, please install it first. pip install gfpgan"
            )
        return None
iopaint/plugins/segment_anything/modeling/prompt_encoder.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch import nn
10
+
11
+ from typing import Any, Optional, Tuple, Type
12
+
13
+ from .common import LayerNorm2d
14
+
15
+
16
class PromptEncoder(nn.Module):
    """Encode point, box and mask prompts for SAM's mask decoder."""

    def __init__(
        self,
        embed_dim: int,
        image_embedding_size: Tuple[int, int],
        input_image_size: Tuple[int, int],
        mask_in_chans: int,
        activation: Type[nn.Module] = nn.GELU,
    ) -> None:
        """
        Build the embedding tables and the mask down-scaling network.

        Arguments:
          embed_dim (int): The prompts' embedding dimension
          image_embedding_size (tuple(int, int)): The spatial size of the
            image embedding, as (H, W).
          input_image_size (tuple(int, int)): The padded size of the image
            as input to the image encoder, as (H, W).
          mask_in_chans (int): The number of hidden channels used for
            encoding input masks.
          activation (nn.Module): The activation to use when encoding
            input masks.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.input_image_size = input_image_size
        self.image_embedding_size = image_embedding_size
        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)

        # One learned vector each for: negative point, positive point,
        # and the two box corners.
        self.num_point_embeddings: int = 4
        self.point_embeddings = nn.ModuleList(
            [nn.Embedding(1, embed_dim) for _ in range(self.num_point_embeddings)]
        )
        self.not_a_point_embed = nn.Embedding(1, embed_dim)

        # Mask inputs are four times the embedding resolution; the two
        # stride-2 convolutions below reduce them by that same factor.
        self.mask_input_size = (
            4 * image_embedding_size[0],
            4 * image_embedding_size[1],
        )
        quarter_chans = mask_in_chans // 4
        self.mask_downscaling = nn.Sequential(
            nn.Conv2d(1, quarter_chans, kernel_size=2, stride=2),
            LayerNorm2d(quarter_chans),
            activation(),
            nn.Conv2d(quarter_chans, mask_in_chans, kernel_size=2, stride=2),
            LayerNorm2d(mask_in_chans),
            activation(),
            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
        )
        self.no_mask_embed = nn.Embedding(1, embed_dim)

    def get_dense_pe(self) -> torch.Tensor:
        """
        Return the positional encoding evaluated on the image-embedding grid.

        Returns:
          torch.Tensor: Positional encoding with shape
            1x(embed_dim)x(embedding_h)x(embedding_w)
        """
        dense_pe = self.pe_layer(self.image_embedding_size)
        return dense_pe.unsqueeze(0)

    def _embed_points(
        self,
        points: torch.Tensor,
        labels: torch.Tensor,
        pad: bool,
    ) -> torch.Tensor:
        """Embed point prompts, optionally appending one padding point."""
        # Shift to the centre of the pixel.
        points = points + 0.5
        if pad:
            # Append one extra point per batch entry, labelled -1.
            extra_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
            extra_label = -torch.ones((labels.shape[0], 1), device=labels.device)
            points = torch.cat([points, extra_point], dim=1)
            labels = torch.cat([labels, extra_label], dim=1)

        embedding = self.pe_layer.forward_with_coords(points, self.input_image_size)

        # Label -1 entries drop their positional encoding and become the
        # learned "not a point" vector; labels 0 and 1 add their own vector
        # on top of the positional encoding.
        is_padding = labels == -1
        embedding[is_padding] = 0.0
        embedding[is_padding] += self.not_a_point_embed.weight
        embedding[labels == 0] += self.point_embeddings[0].weight
        embedding[labels == 1] += self.point_embeddings[1].weight
        return embedding

    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
        """Embed box prompts as two corner points each."""
        # Shift to the centre of the pixel, then view each box as 2 corners.
        corners = (boxes + 0.5).reshape(-1, 2, 2)
        embedding = self.pe_layer.forward_with_coords(corners, self.input_image_size)
        embedding[:, 0, :] += self.point_embeddings[2].weight
        embedding[:, 1, :] += self.point_embeddings[3].weight
        return embedding

    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
        """Embed mask inputs by running them through the down-scaling network."""
        return self.mask_downscaling(masks)

    def _get_batch_size(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> int:
        """
        Return the batch size implied by the first prompt that is present.

        Points take precedence over boxes, boxes over masks; with no prompt
        at all the batch size is 1.
        """
        if points is not None:
            return points[0].shape[0]
        if boxes is not None:
            return boxes.shape[0]
        if masks is not None:
            return masks.shape[0]
        return 1

    def _get_device(self) -> torch.device:
        """Return the device holding the first point-embedding table."""
        return self.point_embeddings[0].weight.device

    def forward(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Embed the supplied prompts and return sparse and dense embeddings.

        Arguments:
          points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
            and labels to embed.
          boxes (torch.Tensor or none): boxes to embed
          masks (torch.Tensor or none): masks to embed

        Returns:
          torch.Tensor: sparse embeddings for the points and boxes, with shape
            BxNx(embed_dim), where N is determined by the number of input points
            and boxes.
          torch.Tensor: dense embeddings for the masks, in the shape
            Bx(embed_dim)x(embed_H)x(embed_W)
        """
        batch_size = self._get_batch_size(points, boxes, masks)

        # Start from an empty (B, 0, embed_dim) tensor and append what exists.
        sparse = torch.empty(
            (batch_size, 0, self.embed_dim), device=self._get_device()
        )
        if points is not None:
            coords, labels = points
            # Points are padded only when no box accompanies them.
            embedded_points = self._embed_points(coords, labels, pad=(boxes is None))
            sparse = torch.cat([sparse, embedded_points], dim=1)
        if boxes is not None:
            embedded_boxes = self._embed_boxes(boxes)
            sparse = torch.cat([sparse, embedded_boxes], dim=1)

        if masks is None:
            # Without a mask, broadcast the learned "no mask" vector over the
            # whole embedding grid.
            embed_h = self.image_embedding_size[0]
            embed_w = self.image_embedding_size[1]
            dense = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
                batch_size, -1, embed_h, embed_w
            )
        else:
            dense = self._embed_masks(masks)

        return sparse, dense
169
+
170
+
171
class PositionEmbeddingRandom(nn.Module):
    """
    Positional encoding using random spatial frequencies.
    """

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        """
        Draw and store the random projection matrix.

        Arguments:
          num_pos_feats (int): Number of random frequencies; the encoding
            has twice this many channels (sine and cosine).
          scale (float or None): Multiplier for the random matrix. None or
            a non-positive value falls back to 1.0.
        """
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        self.register_buffer(
            "positional_encoding_gaussian_matrix",
            scale * torch.randn((2, num_pos_feats)),
        )

    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points that are normalized to [0,1]."""
        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix
        coords = 2 * np.pi * coords
        # outputs d_1 x ... x d_n x C shape
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
        """Generate positional encoding for a grid of the specified size."""
        h, w = size
        device: Any = self.positional_encoding_gaussian_matrix.device
        grid = torch.ones((h, w), device=device, dtype=torch.float32)
        # Cumulative sums minus 0.5 give cell-centre coordinates, which are
        # then normalised by the grid height and width.
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / h
        x_embed = x_embed / w

        pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
        return pe.permute(2, 0, 1)  # C x H x W

    def forward_with_coords(
        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
    ) -> torch.Tensor:
        """
        Positionally encode points that are not normalized to [0,1].

        Arguments:
          coords_input (torch.Tensor): B x N x 2 coordinates in (x, y) order;
            the tensor is not modified.
          image_size (tuple(int, int)): (H, W) used to normalise y and x.

        Returns:
          torch.Tensor: B x N x C encoding.
        """
        coords = coords_input.clone()
        # The normalised values are written back in place below. For an
        # integer tensor that write would truncate every fraction to 0, so
        # promote non-floating inputs first. Floating inputs keep their dtype
        # until the final cast, exactly as before.
        if not coords.is_floating_point():
            coords = coords.to(torch.float)
        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
        return self._pe_encoding(coords.to(torch.float))  # B x N x C
iopaint/plugins/segment_anything/modeling/sam.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+
11
+ from typing import Any, Dict, List, Tuple
12
+
13
+ from .image_encoder import ImageEncoderViT
14
+ from .mask_decoder import MaskDecoder
15
+ from .prompt_encoder import PromptEncoder
16
+
17
+
18
class Sam(nn.Module):
    """Segment Anything model: image encoder, prompt encoder and mask decoder."""

    # Logits above this value count as mask foreground.
    mask_threshold: float = 0.0
    image_format: str = "RGB"

    def __init__(
        self,
        image_encoder: ImageEncoderViT,
        prompt_encoder: PromptEncoder,
        mask_decoder: MaskDecoder,
        pixel_mean: List[float] = [123.675, 116.28, 103.53],
        pixel_std: List[float] = [58.395, 57.12, 57.375],
    ) -> None:
        """
        SAM predicts object masks from an image and input prompts.

        Arguments:
          image_encoder (ImageEncoderViT): The backbone used to encode the
            image into image embeddings that allow for efficient mask prediction.
          prompt_encoder (PromptEncoder): Encodes various types of input prompts.
          mask_decoder (MaskDecoder): Predicts masks from the image embeddings
            and encoded prompts.
          pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
          pixel_std (list(float)): Std values for normalizing pixels in the input image.
        """
        super().__init__()
        self.image_encoder = image_encoder
        self.prompt_encoder = prompt_encoder
        self.mask_decoder = mask_decoder

        # Stored as non-persistent buffers shaped (C, 1, 1) so they broadcast
        # over image height and width.
        mean = torch.Tensor(pixel_mean).view(-1, 1, 1)
        std = torch.Tensor(pixel_std).view(-1, 1, 1)
        self.register_buffer("pixel_mean", mean, persistent=False)
        self.register_buffer("pixel_std", std, persistent=False)

    @property
    def device(self) -> Any:
        """Device on which the normalisation buffers live."""
        return self.pixel_mean.device

    @torch.no_grad()
    def forward(
        self,
        batched_input: List[Dict[str, Any]],
        multimask_output: bool,
    ) -> List[Dict[str, torch.Tensor]]:
        """
        Predicts masks end-to-end from provided images and prompts.
        If prompts are not known in advance, using SamPredictor is
        recommended over calling the model directly.

        Arguments:
          batched_input (list(dict)): A list over input images, each a
            dictionary with the following keys. A prompt key can be
            excluded if it is not present.
              'image': The image as a torch tensor in 3xHxW format,
                already transformed for input to the model.
              'original_size': (tuple(int, int)) The original size of
                the image before transformation, as (H, W).
              'point_coords': (torch.Tensor) Batched point prompts for
                this image, with shape BxNx2. Already transformed to the
                input frame of the model.
              'point_labels': (torch.Tensor) Batched labels for point prompts,
                with shape BxN.
              'boxes': (torch.Tensor) Batched box inputs, with shape Bx4.
                Already transformed to the input frame of the model.
              'mask_inputs': (torch.Tensor) Batched mask inputs to the model,
                in the form Bx1xHxW.
          multimask_output (bool): Whether the model should predict multiple
            disambiguating masks, or return a single mask.

        Returns:
          (list(dict)): A list over input images, where each element is
            a dictionary with the following keys.
              'masks': (torch.Tensor) Batched binary mask predictions,
                with shape BxCxHxW, where B is the number of input prompts,
                C is determined by multimask_output, and (H, W) is the
                original size of the image.
              'iou_predictions': (torch.Tensor) The model's predictions
                of mask quality, in shape BxC.
              'low_res_logits': (torch.Tensor) Low resolution logits with
                shape BxCxHxW, where H=W=256. Can be passed as mask input
                to subsequent iterations of prediction.
        """
        # Encode all images in a single batch.
        preprocessed = [self.preprocess(record["image"]) for record in batched_input]
        image_embeddings = self.image_encoder(torch.stack(preprocessed, dim=0))

        results = []
        # Prompts are decoded one image at a time.
        for record, embedding in zip(batched_input, image_embeddings):
            points = (
                (record["point_coords"], record["point_labels"])
                if "point_coords" in record
                else None
            )
            sparse_embeddings, dense_embeddings = self.prompt_encoder(
                points=points,
                boxes=record.get("boxes", None),
                masks=record.get("mask_inputs", None),
            )
            low_res_masks, iou_predictions = self.mask_decoder(
                image_embeddings=embedding.unsqueeze(0),
                image_pe=self.prompt_encoder.get_dense_pe(),
                sparse_prompt_embeddings=sparse_embeddings,
                dense_prompt_embeddings=dense_embeddings,
                multimask_output=multimask_output,
            )
            upscaled = self.postprocess_masks(
                low_res_masks,
                input_size=record["image"].shape[-2:],
                original_size=record["original_size"],
            )
            results.append(
                {
                    "masks": upscaled > self.mask_threshold,
                    "iou_predictions": iou_predictions,
                    "low_res_logits": low_res_masks,
                }
            )
        return results

    def postprocess_masks(
        self,
        masks: torch.Tensor,
        input_size: Tuple[int, ...],
        original_size: Tuple[int, ...],
    ) -> torch.Tensor:
        """
        Remove padding and upscale masks to the original image size.

        Arguments:
          masks (torch.Tensor): Batched masks from the mask_decoder,
            in BxCxHxW format.
          input_size (tuple(int, int)): The size of the image input to the
            model, in (H, W) format. Used to remove padding.
          original_size (tuple(int, int)): The original size of the image
            before resizing for input to the model, in (H, W) format.

        Returns:
          (torch.Tensor): Batched masks in BxCxHxW format, where (H, W)
            is given by original_size.
        """
        side = self.image_encoder.img_size
        # 1) Upscale to the padded square the encoder saw.
        padded = F.interpolate(
            masks, (side, side), mode="bilinear", align_corners=False
        )
        # 2) Crop away the padding added by preprocess().
        unpadded = padded[..., : input_size[0], : input_size[1]]
        # 3) Resize to the original image resolution.
        return F.interpolate(
            unpadded, original_size, mode="bilinear", align_corners=False
        )

    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize pixel values and pad to a square input."""
        normalized = (x - self.pixel_mean) / self.pixel_std

        # Zero-pad on the right and bottom up to the encoder's square size.
        height, width = normalized.shape[-2:]
        side = self.image_encoder.img_size
        return F.pad(normalized, (0, side - width, 0, side - height))
iopaint/plugins/segment_anything/predictor.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import numpy as np
8
+ import torch
9
+
10
+ from .modeling import Sam
11
+
12
+ from typing import Optional, Tuple
13
+
14
+
15
class SamPredictor:
    """Cache one image's SAM embedding and predict masks for it repeatedly."""

    def __init__(
        self,
        sam_model: Sam,
    ) -> None:
        """
        Uses SAM to calculate the image embedding for an image, and then
        allow repeated, efficient mask prediction given prompts.

        Arguments:
          sam_model (Sam): The model to use for mask prediction.
        """
        super().__init__()
        self.model = sam_model
        from .utils.transforms import ResizeLongestSide

        self.transform = ResizeLongestSide(sam_model.image_encoder.img_size)
        self.reset_image()

    def set_image(
        self,
        image: np.ndarray,
        image_format: str = "RGB",
    ) -> None:
        """
        Calculates the image embeddings for the provided image, allowing
        masks to be predicted with the 'predict' method.

        Arguments:
          image (np.ndarray): The image for calculating masks. Expects an
            image in HWC uint8 format, with pixel values in [0, 255].
          image_format (str): The color format of the image, in ['RGB', 'BGR'].
        """
        assert image_format in [
            "RGB",
            "BGR",
        ], f"image_format must be in ['RGB', 'BGR'], is {image_format}."
        if image_format != self.model.image_format:
            # Reverse the channel axis to match the model's expected format.
            image = image[..., ::-1]

        # Transform the image to the form expected by the model
        input_image = self.transform.apply_image(image)
        input_image_torch = torch.as_tensor(input_image, device=self.device)
        # HWC -> 1xCxHxW
        input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[
            None, :, :, :
        ]

        self.set_torch_image(input_image_torch, image.shape[:2])

    @torch.no_grad()
    def set_torch_image(
        self,
        transformed_image: torch.Tensor,
        original_image_size: Tuple[int, ...],
    ) -> None:
        """
        Calculates the image embeddings for the provided image, allowing
        masks to be predicted with the 'predict' method. Expects the input
        image to be already transformed to the format expected by the model.

        Arguments:
          transformed_image (torch.Tensor): The input image, with shape
            1x3xHxW, which has been transformed with ResizeLongestSide.
          original_image_size (tuple(int, int)): The size of the image
            before transformation, in (H, W) format.
        """
        assert (
            len(transformed_image.shape) == 4
            and transformed_image.shape[1] == 3
            and max(*transformed_image.shape[2:]) == self.model.image_encoder.img_size
        ), f"set_torch_image input must be BCHW with long side {self.model.image_encoder.img_size}."
        self.reset_image()

        self.original_size = original_image_size
        self.input_size = tuple(transformed_image.shape[-2:])
        input_image = self.model.preprocess(transformed_image)
        self.features = self.model.image_encoder(input_image)
        self.is_image_set = True

    def predict(
        self,
        point_coords: Optional[np.ndarray] = None,
        point_labels: Optional[np.ndarray] = None,
        box: Optional[np.ndarray] = None,
        mask_input: Optional[np.ndarray] = None,
        multimask_output: bool = True,
        return_logits: bool = False,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Predict masks for the given input prompts, using the currently set image.

        Arguments:
          point_coords (np.ndarray or None): A Nx2 array of point prompts to the
            model. Each point is in (X,Y) in pixels.
          point_labels (np.ndarray or None): A length N array of labels for the
            point prompts. 1 indicates a foreground point and 0 indicates a
            background point.
          box (np.ndarray or None): A length 4 array given a box prompt to the
            model, in XYXY format.
          mask_input (np.ndarray): A low resolution mask input to the model, typically
            coming from a previous prediction iteration. Has form 1xHxW, where
            for SAM, H=W=256.
          multimask_output (bool): If true, the model will return three masks.
            For ambiguous input prompts (such as a single click), this will often
            produce better masks than a single prediction. If only a single
            mask is needed, the model's predicted quality score can be used
            to select the best mask. For non-ambiguous prompts, such as multiple
            input prompts, multimask_output=False can give better results.
          return_logits (bool): If true, returns un-thresholded masks logits
            instead of a binary mask.

        Returns:
          (np.ndarray): The output masks in CxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
          (np.ndarray): An array of length C containing the model's
            predictions for the quality of each mask.
          (np.ndarray): An array of shape CxHxW, where C is the number
            of masks and H=W=256. These low resolution logits can be passed to
            a subsequent iteration as mask input.

        Raises:
          RuntimeError: If no image has been set.
        """
        if not self.is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) before mask prediction."
            )

        # Transform input prompts into batched tensors (batch size 1) in the
        # model's input frame.
        coords_torch, labels_torch, box_torch, mask_input_torch = None, None, None, None
        if point_coords is not None:
            assert (
                point_labels is not None
            ), "point_labels must be supplied if point_coords is supplied."
            point_coords = self.transform.apply_coords(point_coords, self.original_size)
            coords_torch = torch.as_tensor(
                point_coords, dtype=torch.float, device=self.device
            )
            labels_torch = torch.as_tensor(
                point_labels, dtype=torch.int, device=self.device
            )
            coords_torch, labels_torch = coords_torch[None, :, :], labels_torch[None, :]
        if box is not None:
            box = self.transform.apply_boxes(box, self.original_size)
            box_torch = torch.as_tensor(box, dtype=torch.float, device=self.device)
            box_torch = box_torch[None, :]
        if mask_input is not None:
            mask_input_torch = torch.as_tensor(
                mask_input, dtype=torch.float, device=self.device
            )
            mask_input_torch = mask_input_torch[None, :, :, :]

        masks, iou_predictions, low_res_masks = self.predict_torch(
            coords_torch,
            labels_torch,
            box_torch,
            mask_input_torch,
            multimask_output,
            return_logits=return_logits,
        )

        # Drop the batch dimension and move the results to NumPy.
        masks = masks[0].detach().cpu().numpy()
        iou_predictions = iou_predictions[0].detach().cpu().numpy()
        low_res_masks = low_res_masks[0].detach().cpu().numpy()
        return masks, iou_predictions, low_res_masks

    @torch.no_grad()
    def predict_torch(
        self,
        point_coords: Optional[torch.Tensor],
        point_labels: Optional[torch.Tensor],
        boxes: Optional[torch.Tensor] = None,
        mask_input: Optional[torch.Tensor] = None,
        multimask_output: bool = True,
        return_logits: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Predict masks for the given input prompts, using the currently set image.
        Input prompts are batched torch tensors and are expected to already be
        transformed to the input frame using ResizeLongestSide.

        Arguments:
          point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
            model. Each point is in (X,Y) in pixels.
          point_labels (torch.Tensor or None): A BxN array of labels for the
            point prompts. 1 indicates a foreground point and 0 indicates a
            background point.
          boxes (torch.Tensor or None): A Bx4 array given a box prompt to the
            model, in XYXY format.
          mask_input (torch.Tensor or None): A low resolution mask input to the
            model, typically coming from a previous prediction iteration. Has
            form Bx1xHxW, where for SAM, H=W=256. Masks returned by a previous
            iteration of the predict method do not need further transformation.
          multimask_output (bool): If true, the model will return three masks.
            For ambiguous input prompts (such as a single click), this will often
            produce better masks than a single prediction. If only a single
            mask is needed, the model's predicted quality score can be used
            to select the best mask. For non-ambiguous prompts, such as multiple
            input prompts, multimask_output=False can give better results.
          return_logits (bool): If true, returns un-thresholded masks logits
            instead of a binary mask.

        Returns:
          (torch.Tensor): The output masks in BxCxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
          (torch.Tensor): An array of shape BxC containing the model's
            predictions for the quality of each mask.
          (torch.Tensor): An array of shape BxCxHxW, where C is the number
            of masks and H=W=256. These low res logits can be passed to
            a subsequent iteration as mask input.

        Raises:
          RuntimeError: If no image has been set.
        """
        if not self.is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) before mask prediction."
            )

        if point_coords is not None:
            points = (point_coords, point_labels)
        else:
            points = None

        # Embed prompts
        sparse_embeddings, dense_embeddings = self.model.prompt_encoder(
            points=points,
            boxes=boxes,
            masks=mask_input,
        )

        # Predict masks
        low_res_masks, iou_predictions = self.model.mask_decoder(
            image_embeddings=self.features,
            image_pe=self.model.prompt_encoder.get_dense_pe(),
            sparse_prompt_embeddings=sparse_embeddings,
            dense_prompt_embeddings=dense_embeddings,
            multimask_output=multimask_output,
        )

        # Upscale the masks to the original image resolution
        masks = self.model.postprocess_masks(
            low_res_masks, self.input_size, self.original_size
        )

        if not return_logits:
            masks = masks > self.model.mask_threshold

        return masks, iou_predictions, low_res_masks

    def get_image_embedding(self) -> torch.Tensor:
        """
        Returns the image embeddings for the currently set image, with
        shape 1xCxHxW, where C is the embedding dimension and (H,W) are
        the embedding spatial dimension of SAM (typically C=256, H=W=64).

        Raises:
          RuntimeError: If no image has been set.
        """
        if not self.is_image_set:
            raise RuntimeError(
                "An image must be set with .set_image(...) to generate an embedding."
            )
        assert (
            self.features is not None
        ), "Features must exist if an image has been set."
        return self.features

    @property
    def device(self) -> torch.device:
        """Device of the wrapped model."""
        return self.model.device

    def reset_image(self) -> None:
        """Resets the currently set image."""
        self.is_image_set = False
        self.features = None
        # These are the attributes set_torch_image() writes and predict_torch()
        # reads; clear them so no sizes from a previous image survive a reset.
        self.original_size = None
        self.input_size = None
        # Legacy attributes, not read elsewhere in this class; kept so any
        # external code inspecting them keeps working.
        self.orig_h = None
        self.orig_w = None
        self.input_h = None
        self.input_w = None
iopaint/runtime.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/huggingface/huggingface_hub/blob/5a12851f54bf614be39614034ed3a9031922d297/src/huggingface_hub/utils/_runtime.py
2
+ import os
3
+ import platform
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import packaging.version
8
+ from iopaint.schema import Device
9
+ from loguru import logger
10
+ from rich import print
11
+ from typing import Dict, Any
12
+
13
+
14
# Interpreter version string with any trailing "+" stripped.
_PY_VERSION: str = sys.version.split()[0].rstrip("+")

# importlib.metadata is part of the standard library from Python 3.8 on;
# older interpreters need the `importlib_metadata` backport. The structured
# version tuple is compared directly instead of parsing the version string.
if sys.version_info < (3, 8):
    import importlib_metadata  # type: ignore
else:
    import importlib.metadata as importlib_metadata  # type: ignore

# Distribution name -> installed version, or "N/A" when not installed.
_package_versions = {}

_CANDIDATES = [
    "torch",
    "torchvision",
    "Pillow",
    "diffusers",
    "transformers",
    "opencv-python",
    "accelerate",
    "iopaint",
    "rembg",
    "realesrgan",
    "gfpgan",
]
# Check once at runtime
for name in _CANDIDATES:
    try:
        _package_versions[name] = importlib_metadata.version(name)
    except importlib_metadata.PackageNotFoundError:
        _package_versions[name] = "N/A"
43
+
44
+
45
def dump_environment_info() -> Dict[str, str]:
    """Print and return platform, Python and package version information.

    The report is printed as one "- key: value" line per entry, followed
    by a blank line, and the underlying dictionary is returned.
    """
    # Generic machine info
    info: Dict[str, Any] = {}
    info["Platform"] = platform.platform()
    info["Python version"] = platform.python_version()

    # Package versions collected once at import time.
    info.update(_package_versions)

    report_lines = [f"- {prop}: {val}" for prop, val in info.items()]
    print("\n".join(report_lines) + "\n")
    return info
56
+
57
+
58
def check_device(device: Device) -> Device:
    """Return `device` if it is usable on this machine, else `Device.cpu`.

    A warning is logged whenever the requested device is replaced by the CPU.
    torch is imported lazily, only when an accelerator is requested.
    """
    if device == Device.cuda:
        import platform

        if platform.system() == "Darwin":
            logger.warning("MacOS does not support cuda, use cpu instead")
            return Device.cpu

        import torch

        if torch.cuda.is_available():
            return device
        logger.warning("CUDA is not available, use cpu instead")
        return Device.cpu

    if device == Device.mps:
        import torch

        if torch.backends.mps.is_available():
            return device
        logger.warning("mps is not available, use cpu instead")
        return Device.cpu

    return device
78
+
79
+
80
def setup_model_dir(model_dir: Path):
    """Resolve the model directory, export it via env vars and create it.

    The user-expanded, absolute path is written to the `U2NET_HOME` and
    `XDG_CACHE_HOME` environment variables, the directory is created if it
    does not exist yet, and the resolved path is returned.
    """
    resolved_dir = model_dir.expanduser().absolute()
    logger.info(f"Model directory: {resolved_dir}")

    for env_key in ("U2NET_HOME", "XDG_CACHE_HOME"):
        os.environ[env_key] = str(resolved_dir)

    if not resolved_dir.exists():
        logger.info(f"Create model directory: {resolved_dir}")
        resolved_dir.mkdir(exist_ok=True, parents=True)
    return resolved_dir
iopaint/schema.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from enum import Enum
3
+ from pathlib import Path
4
+ from typing import Optional, Literal, List
5
+
6
+ from iopaint.const import (
7
+ INSTRUCT_PIX2PIX_NAME,
8
+ KANDINSKY22_NAME,
9
+ POWERPAINT_NAME,
10
+ ANYTEXT_NAME,
11
+ SDXL_CONTROLNET_CHOICES,
12
+ SD2_CONTROLNET_CHOICES,
13
+ SD_CONTROLNET_CHOICES,
14
+ )
15
+ from loguru import logger
16
+ from pydantic import BaseModel, Field, field_validator, computed_field
17
+
18
+
19
# Model categories; members are str-valued so they compare equal to their
# plain string form.
class ModelType(str, Enum):
    # Erase-style inpainting models (LaMa, MAT...)
    INPAINT = "inpaint"
    # Diffusers pipelines: SD / SDXL, each in a base and an inpaint flavour
    DIFFUSERS_SD = "diffusers_sd"
    DIFFUSERS_SD_INPAINT = "diffusers_sd_inpaint"
    DIFFUSERS_SDXL = "diffusers_sdxl"
    DIFFUSERS_SDXL_INPAINT = "diffusers_sdxl_inpaint"
    # Any other diffusers-based model
    DIFFUSERS_OTHER = "diffusers_other"
26
+
27
+
28
# Describes one model plus the feature flags derived from its type and name.
# Each flag is a pydantic computed field, so it is included when the model
# is serialised.
class ModelInfo(BaseModel):
    name: str
    path: str
    model_type: ModelType
    is_single_file_diffusers: bool = False

    # True for the four SD / SDXL diffusers types (base and inpaint).
    def _is_sd_family(self) -> bool:
        return self.model_type in (
            ModelType.DIFFUSERS_SD,
            ModelType.DIFFUSERS_SDXL,
            ModelType.DIFFUSERS_SD_INPAINT,
            ModelType.DIFFUSERS_SDXL_INPAINT,
        )

    # SD-family models and a few named models take a text prompt.
    @computed_field
    @property
    def need_prompt(self) -> bool:
        if self._is_sd_family():
            return True
        return self.name in (
            INSTRUCT_PIX2PIX_NAME,
            KANDINSKY22_NAME,
            POWERPAINT_NAME,
            ANYTEXT_NAME,
        )

    # ControlNet choices matching this model; empty when none apply.
    @computed_field
    @property
    def controlnets(self) -> List[str]:
        kind = self.model_type
        if kind in (ModelType.DIFFUSERS_SDXL, ModelType.DIFFUSERS_SDXL_INPAINT):
            return SDXL_CONTROLNET_CHOICES
        if kind in (ModelType.DIFFUSERS_SD, ModelType.DIFFUSERS_SD_INPAINT):
            # SD models whose name contains "sd2" get the SD2 list.
            if "sd2" in self.name.lower():
                return SD2_CONTROLNET_CHOICES
            return SD_CONTROLNET_CHOICES
        if self.name == POWERPAINT_NAME:
            return SD_CONTROLNET_CHOICES
        return []

    @computed_field
    @property
    def support_strength(self) -> bool:
        if self._is_sd_family():
            return True
        return self.name in (POWERPAINT_NAME, ANYTEXT_NAME)

    @computed_field
    @property
    def support_outpainting(self) -> bool:
        if self._is_sd_family():
            return True
        return self.name in (KANDINSKY22_NAME, POWERPAINT_NAME)

    @computed_field
    @property
    def support_lcm_lora(self) -> bool:
        return self._is_sd_family()

    @computed_field
    @property
    def support_controlnet(self) -> bool:
        return self._is_sd_family()

    @computed_field
    @property
    def support_freeu(self) -> bool:
        if self._is_sd_family():
            return True
        return self.name in (INSTRUCT_PIX2PIX_NAME,)
115
+
116
+
117
# Base class for str-valued enums that can list their raw values.
class Choices(str, Enum):
    @classmethod
    def values(cls):
        """Return the member values in definition order."""
        return list(item.value for item in cls)
121
+
122
+
123
# Selectable RealESRGAN model names.
class RealESRGANModel(Choices):
    realesr_general_x4v3 = "realesr-general-x4v3"
    RealESRGAN_x4plus = "RealESRGAN_x4plus"
    RealESRGAN_x4plus_anime_6B = "RealESRGAN_x4plus_anime_6B"
127
+
128
+
129
# Selectable background-removal model names.
class RemoveBGModel(Choices):
    u2net = "u2net"
    u2netp = "u2netp"
    u2net_human_seg = "u2net_human_seg"
    u2net_cloth_seg = "u2net_cloth_seg"
    silueta = "silueta"
    isnet_general_use = "isnet-general-use"
    briaai_rmbg_1_4 = "briaai/RMBG-1.4"
137
+
138
+
139
# Compute devices a model or plugin can be asked to run on.
class Device(Choices):
    cpu = "cpu"
    cuda = "cuda"
    mps = "mps"
143
+
144
+
145
# Selectable interactive-segmentation model names.
class InteractiveSegModel(Choices):
    vit_b = "vit_b"
    vit_l = "vit_l"
    vit_h = "vit_h"
    mobile_sam = "mobile_sam"
150
+
151
+
152
# Name of a plugin and which kinds of output it can generate.
class PluginInfo(BaseModel):
    name: str
    # Both capability flags default to off.
    support_gen_image: bool = False
    support_gen_mask: bool = False
156
+
157
+
158
# Names of the OpenCV inpainting flags that can be requested.
class CV2Flag(str, Enum):
    INPAINT_NS = "INPAINT_NS"
    INPAINT_TELEA = "INPAINT_TELEA"
161
+
162
+
163
# How a high-resolution image is prepared before inpainting.
class HDStrategy(str, Enum):
    # Use original image size
    ORIGINAL = "Original"

    # Resize the longer side of the image to a specific size
    # (hd_strategy_resize_limit), then do inpainting on the resized image.
    # Finally, resize the inpainting result to the original size.
    # The area outside the mask will not lose quality.
    RESIZE = "Resize"

    # Crop the masked area (with a margin controlled by
    # hd_strategy_crop_margin) from the original image to do inpainting
    CROP = "Crop"
172
+
173
+
174
# Sampler names available for the ldm model.
class LDMSampler(str, Enum):
    ddim = "ddim"
    plms = "plms"
177
+
178
+
179
# Sampler names available for diffusion models; values are display names.
class SDSampler(str, Enum):
    # DPM++ family
    dpm_plus_plus_2m = "DPM++ 2M"
    dpm_plus_plus_2m_karras = "DPM++ 2M Karras"
    dpm_plus_plus_2m_sde = "DPM++ 2M SDE"
    dpm_plus_plus_2m_sde_karras = "DPM++ 2M SDE Karras"
    dpm_plus_plus_sde = "DPM++ SDE"
    dpm_plus_plus_sde_karras = "DPM++ SDE Karras"
    # DPM2 family
    dpm2 = "DPM2"
    dpm2_karras = "DPM2 Karras"
    dpm2_a = "DPM2 a"
    dpm2_a_karras = "DPM2 a Karras"
    # Euler / Heun / LMS
    euler = "Euler"
    euler_a = "Euler a"
    heun = "Heun"
    lms = "LMS"
    lms_karras = "LMS Karras"

    # Remaining samplers
    ddim = "DDIM"
    pndm = "PNDM"
    uni_pc = "UniPC"
    lcm = "LCM"
200
+
201
+
202
# FreeU parameters (s1, s2, b1, b2) with their default values.
class FREEUConfig(BaseModel):
    s1: float = 0.9
    s2: float = 0.2
    b1: float = 1.2
    b2: float = 1.4
207
+
208
+
209
# Task names selectable for the PowerPaint model.
class PowerPaintTask(str, Enum):
    text_guided = "text-guided"
    shape_guided = "shape-guided"
    object_remove = "object-remove"
    outpainting = "outpainting"
214
+
215
+
216
# Server start-up configuration. Every field is required (no defaults).
class ApiConfig(BaseModel):
    # Network / UI
    host: str
    port: int
    inbrowser: bool

    # Main model and its runtime switches
    model: str
    no_half: bool
    low_mem: bool
    cpu_offload: bool
    disable_nsfw_checker: bool
    local_files_only: bool
    cpu_textencoder: bool
    device: Device

    # Input / output locations and output quality
    input: Optional[Path]
    output_dir: Optional[Path]
    quality: int

    # Interactive segmentation plugin
    enable_interactive_seg: bool
    interactive_seg_model: InteractiveSegModel
    interactive_seg_device: Device

    # Background removal / anime segmentation plugins
    enable_remove_bg: bool
    remove_bg_model: str
    enable_anime_seg: bool

    # RealESRGAN plugin
    enable_realesrgan: bool
    realesrgan_device: Device
    realesrgan_model: RealESRGANModel

    # GFPGAN plugin
    enable_gfpgan: bool
    gfpgan_device: Device

    # RestoreFormer plugin
    enable_restoreformer: bool
    restoreformer_device: Device
244
+
245
+
246
# Parameters of a single inpainting request. Every field has a default, so
# `InpaintRequest()` is a valid request.
class InpaintRequest(BaseModel):
    image: Optional[str] = Field(None, description="base64 encoded image")
    mask: Optional[str] = Field(None, description="base64 encoded mask")

    ldm_steps: int = Field(20, description="Steps for ldm model.")
    # Keyword was misspelled "discription", which dropped the description.
    ldm_sampler: str = Field(LDMSampler.plms, description="Sampler for ldm model.")
    zits_wireframe: bool = Field(True, description="Enable wireframe for zits model.")

    hd_strategy: str = Field(
        HDStrategy.CROP,
        description="Different way to preprocess image, only used by erase models(e.g. lama/mat)",
    )
    hd_strategy_crop_trigger_size: int = Field(
        800,
        description="Crop trigger size for hd_strategy=CROP, if the longer side of the image is larger than this value, use crop strategy",
    )
    hd_strategy_crop_margin: int = Field(
        128, description="Crop margin for hd_strategy=CROP"
    )
    hd_strategy_resize_limit: int = Field(
        1280, description="Resize limit for hd_strategy=RESIZE"
    )

    prompt: str = Field("", description="Prompt for diffusion models.")
    negative_prompt: str = Field(
        "", description="Negative prompt for diffusion models."
    )
    use_croper: bool = Field(
        False, description="Crop image before doing diffusion inpainting"
    )
    croper_x: int = Field(0, description="Crop x for croper")
    croper_y: int = Field(0, description="Crop y for croper")
    croper_height: int = Field(512, description="Crop height for croper")
    croper_width: int = Field(512, description="Crop width for croper")

    use_extender: bool = Field(
        False, description="Extend image before doing sd outpainting"
    )
    extender_x: int = Field(0, description="Extend x for extender")
    extender_y: int = Field(0, description="Extend y for extender")
    extender_height: int = Field(640, description="Extend height for extender")
    extender_width: int = Field(640, description="Extend width for extender")

    sd_scale: float = Field(
        1.0,
        description="Resize the image before doing sd inpainting, the area outside the mask will not lose quality.",
        gt=0.0,
        le=1.0,
    )
    sd_mask_blur: int = Field(
        11,
        description="Blur the edge of mask area. The higher the number the smoother blend with the original image",
    )
    sd_strength: float = Field(
        1.0,
        description="Strength is a measure of how much noise is added to the base image, which influences how similar the output is to the base image. Higher value means more noise and more different from the base image",
        le=1.0,
    )
    sd_steps: int = Field(
        50,
        description="The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.",
    )
    # Keyword was "help", which is not a Field parameter; the text belongs
    # in "description".
    sd_guidance_scale: float = Field(
        7.5,
        description="Higher guidance scale encourages to generate images that are closely linked to the text prompt, usually at the expense of lower image quality.",
    )
    sd_sampler: str = Field(
        SDSampler.uni_pc, description="Sampler for diffusion model."
    )
    # validate_default=True so the -1 -> random-seed substitution below also
    # runs when the default value is used.
    sd_seed: int = Field(
        42,
        description="Seed for diffusion model. -1 mean random seed",
        validate_default=True,
    )
    sd_match_histograms: bool = Field(
        False,
        description="Match histograms between inpainting area and original image.",
    )

    sd_outpainting_softness: float = Field(20.0)
    sd_outpainting_space: float = Field(20.0)

    sd_freeu: bool = Field(
        False,
        description="Enable freeu mode. https://huggingface.co/docs/diffusers/main/en/using-diffusers/freeu",
    )
    sd_freeu_config: FREEUConfig = FREEUConfig()

    sd_lcm_lora: bool = Field(
        False,
        description="Enable lcm-lora mode. https://huggingface.co/docs/diffusers/main/en/using-diffusers/inference_with_lcm#texttoimage",
    )

    sd_keep_unmasked_area: bool = Field(
        True, description="Keep unmasked area unchanged"
    )

    cv2_flag: CV2Flag = Field(
        CV2Flag.INPAINT_NS,
        description="Flag for opencv inpainting: https://docs.opencv.org/4.6.0/d7/d8b/group__photo__inpaint.html#gga8002a65f5a3328fbf15df81b842d3c3ca05e763003a805e6c11c673a9f4ba7d07",
    )
    cv2_radius: int = Field(
        4,
        description="Radius of a circular neighborhood of each point inpainted that is considered by the algorithm",
    )

    # Paint by Example
    paint_by_example_example_image: Optional[str] = Field(
        None, description="Base64 encoded example image for paint by example model"
    )

    # InstructPix2Pix
    p2p_image_guidance_scale: float = Field(1.5, description="Image guidance scale")

    # ControlNet
    enable_controlnet: bool = Field(False, description="Enable controlnet")
    controlnet_conditioning_scale: float = Field(
        0.4, description="Conditioning scale", ge=0.0, le=1.0
    )
    controlnet_method: str = Field(
        "lllyasviel/control_v11p_sd15_canny", description="Controlnet method"
    )

    # PowerPaint
    powerpaint_task: PowerPaintTask = Field(
        PowerPaintTask.text_guided, description="PowerPaint task"
    )
    fitting_degree: float = Field(
        1.0,
        description="Control the fitting degree of the generated objects to the mask shape.",
        gt=0.0,
        le=1.0,
    )

    @field_validator("sd_seed")
    @classmethod
    def sd_seed_validator(cls, v: int) -> int:
        """Replace the sentinel seed -1 with a random seed."""
        if v == -1:
            return random.randint(1, 99999999)
        return v

    @field_validator("controlnet_conditioning_scale")
    @classmethod
    def validate_field(cls, v: float, values):
        """Force the conditioning scale to 0 when extender and controlnet are both on.

        NOTE(review): this validator only runs when the field is supplied
        explicitly (no validate_default) - confirm that is intended.
        """
        # `values.data` only holds fields that validated successfully, so a
        # failed earlier field must not surface here as a KeyError.
        use_extender = values.data.get("use_extender", False)
        enable_controlnet = values.data.get("enable_controlnet", False)
        if use_extender and enable_controlnet:
            logger.info("Extender is enabled, set controlnet_conditioning_scale=0")
            # Return a float so the stored value matches the field type.
            return 0.0
        return v
396
+
397
+
398
# Request body for running a plugin on one image.
class RunPluginRequest(BaseModel):
    name: str
    image: str = Field(..., description="base64 encoded image")
    clicks: List[List[int]] = Field(
        [],
        description="Clicks for interactive seg, [[x,y,0/1], [x2,y2,0/1]]",
    )
    scale: float = Field(2.0, description="Scale for upscaling")
405
+
406
+
407
# Allowed media tab identifiers.
MediaTab = Literal["input", "output"]
408
+
409
+
410
# One media entry: file name, pixel size and timestamps.
class MediasResponse(BaseModel):
    name: str
    # Image dimensions
    height: int
    width: int
    # Creation / modification times
    ctime: float
    mtime: float
416
+
417
+
418
# Prompt and negative prompt of an image; both default to empty strings.
class GenInfoResponse(BaseModel):
    prompt: str = ""
    negative_prompt: str = ""
421
+
422
+
423
# Server configuration payload. Field names are camelCase and must stay
# byte-identical because they are the serialised keys.
class ServerConfigResponse(BaseModel):
    plugins: List[PluginInfo]
    modelInfos: List[ModelInfo]

    # Current selection followed by the available choices, per plugin
    removeBGModel: RemoveBGModel
    removeBGModels: List[RemoveBGModel]
    realesrganModel: RealESRGANModel
    realesrganModels: List[RealESRGANModel]
    interactiveSegModel: InteractiveSegModel
    interactiveSegModels: List[InteractiveSegModel]

    # Feature switches
    enableFileManager: bool
    enableAutoSaving: bool
    enableControlnet: bool
    controlnetMethod: Optional[str]
    disableModelSwitch: bool
    isDesktop: bool

    samplers: List[str]
439
+
440
+
441
# Request body naming the model to switch to.
class SwitchModelRequest(BaseModel):
    name: str
443
+
444
+
445
# Request body naming a plugin and the model it should switch to.
class SwitchPluginModelRequest(BaseModel):
    plugin_name: str
    model_name: str
448
+
449
+
450
# Allowed mask adjustment operations.
AdjustMaskOperate = Literal["expand", "shrink", "reverse"]
451
+
452
+
453
# Request body for adjusting a mask.
class AdjustMaskRequest(BaseModel):
    mask: str = Field(
        ...,
        description="base64 encoded mask. 255 means area to do inpaint",
    )
    operate: AdjustMaskOperate = Field(..., description="expand/shrink/reverse")
    kernel_size: int = Field(5, description="Kernel size for expanding mask")
iopaint/single_processing.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Dict, Optional
6
+
7
+ import cv2
8
+ import psutil
9
+ from PIL import Image
10
+ from loguru import logger
11
+ from rich.console import Console
12
+ from rich.progress import (
13
+ Progress,
14
+ SpinnerColumn,
15
+ TimeElapsedColumn,
16
+ MofNCompleteColumn,
17
+ TextColumn,
18
+ BarColumn,
19
+ TaskProgressColumn,
20
+ )
21
+
22
+ from iopaint.helper import pil_to_bytes_single
23
+ from iopaint.model.utils import torch_gc
24
+ from iopaint.model_manager import ModelManager
25
+ from iopaint.schema import InpaintRequest
26
+ import numpy as np
27
+
28
+
29
def glob_images(path: Path) -> Dict[str, Path]:
    """Map file stems to image paths found at `path`.

    Args:
        path: a single file or a directory.

    Returns:
        - for a file: ``{stem: path}`` (the suffix is not checked);
        - for a directory: one entry per direct child whose suffix is
          png/jpg/jpeg (case-insensitive); later entries with the same
          stem overwrite earlier ones;
        - for anything else (e.g. a missing path): an empty dict, so the
          result always matches the annotated return type.
    """
    if path.is_file():
        return {path.stem: path}
    if path.is_dir():
        return {
            entry.stem: entry
            for entry in path.glob("*.*")
            if entry.suffix.lower() in (".png", ".jpg", ".jpeg")
        }
    # Neither a file nor a directory: previously fell through to None.
    return {}
39
+
40
+
41
+ # def batch_inpaint(
42
+ # model: str,
43
+ # device,
44
+ # image: Path,
45
+ # mask: Path,
46
+ # config: Optional[Path] = None,
47
+ # concat: bool = False,
48
+ # ):
49
+ # if config is None:
50
+ # inpaint_request = InpaintRequest()
51
+ # else:
52
+ # with open(config, "r", encoding="utf-8") as f:
53
+ # inpaint_request = InpaintRequest(**json.load(f))
54
+ #
55
+ # model_manager = ModelManager(name=model, device=device)
56
+ #
57
+ # img = cv2.imread(str(image))
58
+ # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
59
+ #
60
+ # mask_img = cv2.imread(str(mask), cv2.IMREAD_GRAYSCALE)
61
+ #
62
+ # if mask_img.shape[:2] != img.shape[:2]:
63
+ # mask_img = cv2.resize(
64
+ # mask_img,
65
+ # (img.shape[1], img.shape[0]),
66
+ # interpolation=cv2.INTER_NEAREST,
67
+ # )
68
+ #
69
+ # mask_img[mask_img >= 127] = 255
70
+ # mask_img[mask_img < 127] = 0
71
+ #
72
+ # # bgr
73
+ # inpaint_result = model_manager(img, mask_img, inpaint_request)
74
+ # inpaint_result = cv2.cvtColor(inpaint_result, cv2.COLOR_BGR2RGB)
75
+ #
76
+ # if concat:
77
+ # mask_img = cv2.cvtColor(mask_img, cv2.COLOR_GRAY2RGB)
78
+ # inpaint_result = cv2.hconcat([img, mask_img, inpaint_result])
79
+ #
80
+ # # Convert the NumPy array to PIL Image
81
+ # pil_image = Image.fromarray(inpaint_result)
82
+ #
83
+ # # Encode the PIL Image as base64 string
84
+ # with io.BytesIO() as output_buffer:
85
+ # pil_image.save(output_buffer, format='PNG')
86
+ # base64_image = base64.b64encode(output_buffer.getvalue()).decode('utf-8')
87
+ #
88
+ # return base64_image
89
+
90
def batch_inpaint(
    model: str,
    device,
    input_base64: str,
    mask_base64: str,
    config_base64: Optional[str] = None,
    concat: bool = False,
):
    """Inpaint one base64-encoded image and return the result as base64 PNG.

    Args:
        model: model name passed to ModelManager.
        device: device passed to ModelManager.
        input_base64: base64 of an encoded image file.
        mask_base64: base64 of an encoded mask image; it is read as
            grayscale, resized to the input size if needed and binarised
            at 127.
        config_base64: optional base64 of a JSON object with InpaintRequest
            fields; defaults are used when omitted.
        concat: when true, the returned picture is input | mask | result
            side by side.

    Returns:
        The PNG-encoded result as a base64 string.

    Raises:
        ValueError: if the input or mask bytes cannot be decoded as an image.
    """
    if config_base64 is None:
        inpaint_request = InpaintRequest()
    else:
        config_json = base64.b64decode(config_base64)
        inpaint_request = InpaintRequest(**json.loads(config_json))

    # Decode and validate both images before the model is constructed, so
    # bad input fails fast. cv2.imdecode returns None on undecodable data.
    input_image_data = base64.b64decode(input_base64)
    input_image = cv2.imdecode(
        np.frombuffer(input_image_data, np.uint8), cv2.IMREAD_COLOR
    )
    if input_image is None:
        raise ValueError("input_base64 does not contain a decodable image")

    mask_image_data = base64.b64decode(mask_base64)
    mask_image = cv2.imdecode(
        np.frombuffer(mask_image_data, np.uint8), cv2.IMREAD_GRAYSCALE
    )
    if mask_image is None:
        raise ValueError("mask_base64 does not contain a decodable image")

    # cv2 decodes to BGR. The earlier file-based version of this function
    # (kept as comments in this file) fed the model RGB and converted its
    # BGR result back to RGB; restore both conversions.
    input_image = cv2.cvtColor(input_image, cv2.COLOR_BGR2RGB)

    if mask_image.shape[:2] != input_image.shape[:2]:
        mask_image = cv2.resize(
            mask_image,
            (input_image.shape[1], input_image.shape[0]),
            interpolation=cv2.INTER_NEAREST,
        )

    # Binarise the mask.
    mask_image[mask_image >= 127] = 255
    mask_image[mask_image < 127] = 0

    model_manager = ModelManager(name=model, device=device)

    # Run inpainting
    inpaint_result = model_manager(input_image, mask_image, inpaint_request)
    inpaint_result = cv2.cvtColor(inpaint_result, cv2.COLOR_BGR2RGB)

    if concat:
        mask_image = cv2.cvtColor(mask_image, cv2.COLOR_GRAY2RGB)
        inpaint_result = cv2.hconcat([input_image, mask_image, inpaint_result])

    # Convert NumPy array (RGB) to PIL Image
    pil_image = Image.fromarray(inpaint_result)

    # Encode PIL Image to base64 string
    with io.BytesIO() as output_buffer:
        pil_image.save(output_buffer, format='PNG')
        base64_image = base64.b64encode(output_buffer.getvalue()).decode('utf-8')

    return base64_image
140
+
141
+
142
def batch_inpaint_cv2(
    model: str,
    device,
    input_base: np.ndarray,
    mask_base: np.ndarray,
    config_base64: Optional[str] = None,
    concat: bool = False,
):
    """Inpaint an already-decoded image array and return the result array.

    Despite the parameter names, `input_base` and `mask_base` are used as
    image arrays (their `.shape` is read and they are passed to cv2), not
    as base64 strings.

    Args:
        model: model name passed to ModelManager.
        device: device passed to ModelManager.
        input_base: image array, handed to the model unchanged.
            NOTE(review): no colour conversion is applied here - confirm
            the channel order the caller passes and expects back.
        mask_base: single-channel mask array; resized to the input size if
            needed and binarised at 127. The caller's array is not modified.
        config_base64: optional base64 of a JSON object with InpaintRequest
            fields; defaults are used when omitted.
        concat: when true, returns input | mask | result side by side.

    Returns:
        The array produced by the model (or the concatenated array).
    """
    if config_base64 is None:
        inpaint_request = InpaintRequest()
    else:
        config_json = base64.b64decode(config_base64)
        inpaint_request = InpaintRequest(**json.loads(config_json))

    model_manager = ModelManager(name=model, device=device)

    input_image = input_base
    mask_image = mask_base

    if mask_image.shape[:2] != input_image.shape[:2]:
        mask_image = cv2.resize(
            mask_image,
            (input_image.shape[1], input_image.shape[0]),
            interpolation=cv2.INTER_NEAREST,
        )

    # Binarise into a new array so the caller's mask is never mutated.
    mask_image = np.where(mask_image >= 127, 255, 0).astype(mask_image.dtype)

    # Run inpainting
    inpaint_result = model_manager(input_image, mask_image, inpaint_request)

    if concat:
        mask_image = cv2.cvtColor(mask_image, cv2.COLOR_GRAY2RGB)
        inpaint_result = cv2.hconcat([input_image, mask_image, inpaint_result])

    return inpaint_result
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ opencv-python
3
+ diffusers
4
+ accelerate
5
+ peft==0.7.1
6
+ transformers==4.37.2
7
+ safetensors
8
+ controlnet-aux==0.0.3
9
+ fastapi==0.108.0
10
+ uvicorn
11
+ python-multipart
12
+ python-socketio==5.7.2
13
+ typer
14
+ pydantic>=2.5.2
15
+ rich
16
+ loguru
17
+ yacs
18
+ piexif==1.1.3
19
+ omegaconf
20
+ easydict
21
+ gradio
22
+ typer-config==1.4.0
23
+ Pillow==9.5.0
24
+ ultralytics
25
+ flask
26
+ flask_cors