diff --git a/.gitattributes b/.gitattributes index cef6514019122c8978d0e50384624abec00458a8..9072bac14015f13355e68eea3f0ad0b010303f10 100644 --- a/.gitattributes +++ b/.gitattributes @@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text extensions/Stable-Diffusion-Webui-Civitai-Helper/img/all_in_one.png filter=lfs diff=lfs merge=lfs -text extensions/addtional/models/lora/README.md filter=lfs diff=lfs merge=lfs -text repositories/BLIP/BLIP.gif filter=lfs diff=lfs merge=lfs -text +repositories/generative-models/assets/sdxl_report.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/repositories/CodeFormer/facelib/utils/__pycache__/misc.cpython-310.pyc b/repositories/CodeFormer/facelib/utils/__pycache__/misc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc38051ec8987442bb962bd45420e1d34fef0451 Binary files /dev/null and b/repositories/CodeFormer/facelib/utils/__pycache__/misc.cpython-310.pyc differ diff --git a/repositories/CodeFormer/facelib/utils/face_restoration_helper.py b/repositories/CodeFormer/facelib/utils/face_restoration_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..6b7644ccd3d9978aea7997a76f7c6fdb0ccde8b1 --- /dev/null +++ b/repositories/CodeFormer/facelib/utils/face_restoration_helper.py @@ -0,0 +1,455 @@ +import cv2 +import numpy as np +import os +import torch +from torchvision.transforms.functional import normalize + +from facelib.detection import init_detection_model +from facelib.parsing import init_parsing_model +from facelib.utils.misc import img2tensor, imwrite + + +def get_largest_face(det_faces, h, w): + + def get_location(val, length): + if val < 0: + return 0 + elif val > length: + return length + else: + return val + + face_areas = [] + for det_face in det_faces: + left = get_location(det_face[0], w) + right = get_location(det_face[2], w) + top = get_location(det_face[1], h) + bottom = get_location(det_face[3], h) + face_area = (right - left) * (bottom - top) + face_areas.append(face_area) + largest_idx = face_areas.index(max(face_areas)) + return det_faces[largest_idx], largest_idx + + +def get_center_face(det_faces, h=0, w=0, center=None): + if center is not None: + center = np.array(center) + else: + center = np.array([w / 2, h / 2]) + center_dist = [] + for det_face in det_faces: + face_center = np.array([(det_face[0] + det_face[2]) / 2, (det_face[1] + det_face[3]) / 2]) + dist = np.linalg.norm(face_center - center) + center_dist.append(dist) + center_idx = center_dist.index(min(center_dist)) + return det_faces[center_idx], center_idx + + +class FaceRestoreHelper(object): + """Helper for the face restoration pipeline (base class).""" + + def __init__(self, + upscale_factor, + face_size=512, + crop_ratio=(1, 1), + det_model='retinaface_resnet50', + save_ext='png', + template_3points=False, + pad_blur=False, + use_parse=False, + device=None): + self.template_3points = template_3points # improve robustness + self.upscale_factor = upscale_factor + # the cropped face ratio based on the square face + self.crop_ratio = crop_ratio # (h, w) + assert (self.crop_ratio[0] >= 1 and self.crop_ratio[1] >= 1), 'crop ration only supports >=1' + self.face_size = (int(face_size * self.crop_ratio[1]), int(face_size * self.crop_ratio[0])) + + if self.template_3points: + self.face_template = np.array([[192, 240], [319, 240], [257, 371]]) + else: + # standard 5 landmarks for FFHQ faces with 512 x 512 + # facexlib + self.face_template = np.array([[192.98138, 239.94708], [318.90277, 240.1936], 
[256.63416, 314.01935], + [201.26117, 371.41043], [313.08905, 371.15118]]) + + # dlib: left_eye: 36:41 right_eye: 42:47 nose: 30,32,33,34 left mouth corner: 48 right mouth corner: 54 + # self.face_template = np.array([[193.65928, 242.98541], [318.32558, 243.06108], [255.67984, 328.82894], + # [198.22603, 372.82502], [313.91018, 372.75659]]) + + + self.face_template = self.face_template * (face_size / 512.0) + if self.crop_ratio[0] > 1: + self.face_template[:, 1] += face_size * (self.crop_ratio[0] - 1) / 2 + if self.crop_ratio[1] > 1: + self.face_template[:, 0] += face_size * (self.crop_ratio[1] - 1) / 2 + self.save_ext = save_ext + self.pad_blur = pad_blur + if self.pad_blur is True: + self.template_3points = False + + self.all_landmarks_5 = [] + self.det_faces = [] + self.affine_matrices = [] + self.inverse_affine_matrices = [] + self.cropped_faces = [] + self.restored_faces = [] + self.pad_input_imgs = [] + + if device is None: + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + else: + self.device = device + + # init face detection model + self.face_det = init_detection_model(det_model, half=False, device=self.device) + + # init face parsing model + self.use_parse = use_parse + self.face_parse = init_parsing_model(model_name='parsenet', device=self.device) + + def set_upscale_factor(self, upscale_factor): + self.upscale_factor = upscale_factor + + def read_image(self, img): + """img can be image path or cv2 loaded image.""" + # self.input_img is Numpy array, (h, w, c), BGR, uint8, [0, 255] + if isinstance(img, str): + img = cv2.imread(img) + + if np.max(img) > 256: # 16-bit image + img = img / 65535 * 255 + if len(img.shape) == 2: # gray image + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif img.shape[2] == 4: # BGRA image with alpha channel + img = img[:, :, 0:3] + + self.input_img = img + + if min(self.input_img.shape[:2])<512: + f = 512.0/min(self.input_img.shape[:2]) + self.input_img = cv2.resize(self.input_img, (0,0), fx=f, fy=f, interpolation=cv2.INTER_LINEAR) + + def get_face_landmarks_5(self, + only_keep_largest=False, + only_center_face=False, + resize=None, + blur_ratio=0.01, + eye_dist_threshold=None): + if resize is None: + scale = 1 + input_img = self.input_img + else: + h, w = self.input_img.shape[0:2] + scale = resize / min(h, w) + scale = max(1, scale) # always scale up + h, w = int(h * scale), int(w * scale) + interp = cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR + input_img = cv2.resize(self.input_img, (w, h), interpolation=interp) + + with torch.no_grad(): + bboxes = self.face_det.detect_faces(input_img) + + if bboxes is None or bboxes.shape[0] == 0: + return 0 + else: + bboxes = bboxes / scale + + for bbox in bboxes: + # remove faces with too small eye distance: side faces or too small faces + eye_dist = np.linalg.norm([bbox[6] - bbox[8], bbox[7] - bbox[9]]) + if eye_dist_threshold is not None and (eye_dist < eye_dist_threshold): + continue + + if self.template_3points: + landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 11, 2)]) + else: + landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 15, 2)]) + self.all_landmarks_5.append(landmark) + self.det_faces.append(bbox[0:5]) + + if len(self.det_faces) == 0: + return 0 + if only_keep_largest: + h, w, _ = self.input_img.shape + self.det_faces, largest_idx = get_largest_face(self.det_faces, h, w) + self.all_landmarks_5 = [self.all_landmarks_5[largest_idx]] + elif only_center_face: + h, w, _ = self.input_img.shape + self.det_faces, center_idx = 
get_center_face(self.det_faces, h, w) + self.all_landmarks_5 = [self.all_landmarks_5[center_idx]] + + # pad blurry images + if self.pad_blur: + self.pad_input_imgs = [] + for landmarks in self.all_landmarks_5: + # get landmarks + eye_left = landmarks[0, :] + eye_right = landmarks[1, :] + eye_avg = (eye_left + eye_right) * 0.5 + mouth_avg = (landmarks[3, :] + landmarks[4, :]) * 0.5 + eye_to_eye = eye_right - eye_left + eye_to_mouth = mouth_avg - eye_avg + + # Get the oriented crop rectangle + # x: half width of the oriented crop rectangle + x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1] + # - np.flipud(eye_to_mouth) * [-1, 1]: rotate 90 clockwise + # norm with the hypotenuse: get the direction + x /= np.hypot(*x) # get the hypotenuse of a right triangle + rect_scale = 1.5 + x *= max(np.hypot(*eye_to_eye) * 2.0 * rect_scale, np.hypot(*eye_to_mouth) * 1.8 * rect_scale) + # y: half height of the oriented crop rectangle + y = np.flipud(x) * [-1, 1] + + # c: center + c = eye_avg + eye_to_mouth * 0.1 + # quad: (left_top, left_bottom, right_bottom, right_top) + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + # qsize: side length of the square + qsize = np.hypot(*x) * 2 + border = max(int(np.rint(qsize * 0.1)), 3) + + # get pad + # pad: (width_left, height_top, width_right, height_bottom) + pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + pad = [ + max(-pad[0] + border, 1), + max(-pad[1] + border, 1), + max(pad[2] - self.input_img.shape[0] + border, 1), + max(pad[3] - self.input_img.shape[1] + border, 1) + ] + + if max(pad) > 1: + # pad image + pad_img = np.pad(self.input_img, ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect') + # modify landmark coords + landmarks[:, 0] += pad[0] + landmarks[:, 1] += pad[1] + # blur pad images + h, w, _ = pad_img.shape + y, x, _ = np.ogrid[:h, :w, :1] + mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], + np.float32(w - 1 - x) / pad[2]), + 1.0 - np.minimum(np.float32(y) / pad[1], + np.float32(h - 1 - y) / pad[3])) + blur = int(qsize * blur_ratio) + if blur % 2 == 0: + blur += 1 + blur_img = cv2.boxFilter(pad_img, 0, ksize=(blur, blur)) + # blur_img = cv2.GaussianBlur(pad_img, (blur, blur), 0) + + pad_img = pad_img.astype('float32') + pad_img += (blur_img - pad_img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0) + pad_img += (np.median(pad_img, axis=(0, 1)) - pad_img) * np.clip(mask, 0.0, 1.0) + pad_img = np.clip(pad_img, 0, 255) # float32, [0, 255] + self.pad_input_imgs.append(pad_img) + else: + self.pad_input_imgs.append(np.copy(self.input_img)) + + return len(self.all_landmarks_5) + + def align_warp_face(self, save_cropped_path=None, border_mode='constant'): + """Align and warp faces with face template. 
+ """ + if self.pad_blur: + assert len(self.pad_input_imgs) == len( + self.all_landmarks_5), f'Mismatched samples: {len(self.pad_input_imgs)} and {len(self.all_landmarks_5)}' + for idx, landmark in enumerate(self.all_landmarks_5): + # use 5 landmarks to get affine matrix + # use cv2.LMEDS method for the equivalence to skimage transform + # ref: https://blog.csdn.net/yichxi/article/details/115827338 + affine_matrix = cv2.estimateAffinePartial2D(landmark, self.face_template, method=cv2.LMEDS)[0] + self.affine_matrices.append(affine_matrix) + # warp and crop faces + if border_mode == 'constant': + border_mode = cv2.BORDER_CONSTANT + elif border_mode == 'reflect101': + border_mode = cv2.BORDER_REFLECT101 + elif border_mode == 'reflect': + border_mode = cv2.BORDER_REFLECT + if self.pad_blur: + input_img = self.pad_input_imgs[idx] + else: + input_img = self.input_img + cropped_face = cv2.warpAffine( + input_img, affine_matrix, self.face_size, borderMode=border_mode, borderValue=(135, 133, 132)) # gray + self.cropped_faces.append(cropped_face) + # save the cropped face + if save_cropped_path is not None: + path = os.path.splitext(save_cropped_path)[0] + save_path = f'{path}_{idx:02d}.{self.save_ext}' + imwrite(cropped_face, save_path) + + def get_inverse_affine(self, save_inverse_affine_path=None): + """Get inverse affine matrix.""" + for idx, affine_matrix in enumerate(self.affine_matrices): + inverse_affine = cv2.invertAffineTransform(affine_matrix) + inverse_affine *= self.upscale_factor + self.inverse_affine_matrices.append(inverse_affine) + # save inverse affine matrices + if save_inverse_affine_path is not None: + path, _ = os.path.splitext(save_inverse_affine_path) + save_path = f'{path}_{idx:02d}.pth' + torch.save(inverse_affine, save_path) + + + def add_restored_face(self, face): + self.restored_faces.append(face) + + + def paste_faces_to_input_image(self, save_path=None, upsample_img=None, draw_box=False, face_upsampler=None): + h, w, _ = self.input_img.shape + h_up, w_up = int(h * self.upscale_factor), int(w * self.upscale_factor) + + if upsample_img is None: + # simply resize the background + # upsample_img = cv2.resize(self.input_img, (w_up, h_up), interpolation=cv2.INTER_LANCZOS4) + upsample_img = cv2.resize(self.input_img, (w_up, h_up), interpolation=cv2.INTER_LINEAR) + else: + upsample_img = cv2.resize(upsample_img, (w_up, h_up), interpolation=cv2.INTER_LANCZOS4) + + assert len(self.restored_faces) == len( + self.inverse_affine_matrices), ('length of restored_faces and affine_matrices are different.') + + inv_mask_borders = [] + for restored_face, inverse_affine in zip(self.restored_faces, self.inverse_affine_matrices): + if face_upsampler is not None: + restored_face = face_upsampler.enhance(restored_face, outscale=self.upscale_factor)[0] + inverse_affine /= self.upscale_factor + inverse_affine[:, 2] *= self.upscale_factor + face_size = (self.face_size[0]*self.upscale_factor, self.face_size[1]*self.upscale_factor) + else: + # Add an offset to inverse affine matrix, for more precise back alignment + if self.upscale_factor > 1: + extra_offset = 0.5 * self.upscale_factor + else: + extra_offset = 0 + inverse_affine[:, 2] += extra_offset + face_size = self.face_size + inv_restored = cv2.warpAffine(restored_face, inverse_affine, (w_up, h_up)) + + # if draw_box or not self.use_parse: # use square parse maps + # mask = np.ones(face_size, dtype=np.float32) + # inv_mask = cv2.warpAffine(mask, inverse_affine, (w_up, h_up)) + # # remove the black borders + # inv_mask_erosion = cv2.erode( + # 
inv_mask, np.ones((int(2 * self.upscale_factor), int(2 * self.upscale_factor)), np.uint8)) + # pasted_face = inv_mask_erosion[:, :, None] * inv_restored + # total_face_area = np.sum(inv_mask_erosion) # // 3 + # # add border + # if draw_box: + # h, w = face_size + # mask_border = np.ones((h, w, 3), dtype=np.float32) + # border = int(1400/np.sqrt(total_face_area)) + # mask_border[border:h-border, border:w-border,:] = 0 + # inv_mask_border = cv2.warpAffine(mask_border, inverse_affine, (w_up, h_up)) + # inv_mask_borders.append(inv_mask_border) + # if not self.use_parse: + # # compute the fusion edge based on the area of face + # w_edge = int(total_face_area**0.5) // 20 + # erosion_radius = w_edge * 2 + # inv_mask_center = cv2.erode(inv_mask_erosion, np.ones((erosion_radius, erosion_radius), np.uint8)) + # blur_size = w_edge * 2 + # inv_soft_mask = cv2.GaussianBlur(inv_mask_center, (blur_size + 1, blur_size + 1), 0) + # if len(upsample_img.shape) == 2: # upsample_img is gray image + # upsample_img = upsample_img[:, :, None] + # inv_soft_mask = inv_soft_mask[:, :, None] + + # always use square mask + mask = np.ones(face_size, dtype=np.float32) + inv_mask = cv2.warpAffine(mask, inverse_affine, (w_up, h_up)) + # remove the black borders + inv_mask_erosion = cv2.erode( + inv_mask, np.ones((int(2 * self.upscale_factor), int(2 * self.upscale_factor)), np.uint8)) + pasted_face = inv_mask_erosion[:, :, None] * inv_restored + total_face_area = np.sum(inv_mask_erosion) # // 3 + # add border + if draw_box: + h, w = face_size + mask_border = np.ones((h, w, 3), dtype=np.float32) + border = int(1400/np.sqrt(total_face_area)) + mask_border[border:h-border, border:w-border,:] = 0 + inv_mask_border = cv2.warpAffine(mask_border, inverse_affine, (w_up, h_up)) + inv_mask_borders.append(inv_mask_border) + # compute the fusion edge based on the area of face + w_edge = int(total_face_area**0.5) // 20 + erosion_radius = w_edge * 2 + inv_mask_center = cv2.erode(inv_mask_erosion, np.ones((erosion_radius, erosion_radius), np.uint8)) + blur_size = w_edge * 2 + inv_soft_mask = cv2.GaussianBlur(inv_mask_center, (blur_size + 1, blur_size + 1), 0) + if len(upsample_img.shape) == 2: # upsample_img is gray image + upsample_img = upsample_img[:, :, None] + inv_soft_mask = inv_soft_mask[:, :, None] + + # parse mask + if self.use_parse: + # inference + face_input = cv2.resize(restored_face, (512, 512), interpolation=cv2.INTER_LINEAR) + face_input = img2tensor(face_input.astype('float32') / 255., bgr2rgb=True, float32=True) + normalize(face_input, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True) + face_input = torch.unsqueeze(face_input, 0).to(self.device) + with torch.no_grad(): + out = self.face_parse(face_input)[0] + out = out.argmax(dim=1).squeeze().cpu().numpy() + + parse_mask = np.zeros(out.shape) + MASK_COLORMAP = [0, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 0, 0, 0] + for idx, color in enumerate(MASK_COLORMAP): + parse_mask[out == idx] = color + # blur the mask + parse_mask = cv2.GaussianBlur(parse_mask, (101, 101), 11) + parse_mask = cv2.GaussianBlur(parse_mask, (101, 101), 11) + # remove the black borders + thres = 10 + parse_mask[:thres, :] = 0 + parse_mask[-thres:, :] = 0 + parse_mask[:, :thres] = 0 + parse_mask[:, -thres:] = 0 + parse_mask = parse_mask / 255. 
+ + parse_mask = cv2.resize(parse_mask, face_size) + parse_mask = cv2.warpAffine(parse_mask, inverse_affine, (w_up, h_up), flags=3) + inv_soft_parse_mask = parse_mask[:, :, None] + # pasted_face = inv_restored + fuse_mask = (inv_soft_parse_mask < inv_soft_mask).astype('int') + inv_soft_mask = inv_soft_parse_mask * fuse_mask + inv_soft_mask * (1 - fuse_mask) + + if len(upsample_img.shape) == 3 and upsample_img.shape[2] == 4: # alpha channel + alpha = upsample_img[:, :, 3:] + upsample_img = inv_soft_mask * pasted_face + (1 - inv_soft_mask) * upsample_img[:, :, 0:3] + upsample_img = np.concatenate((upsample_img, alpha), axis=2) + else: + upsample_img = inv_soft_mask * pasted_face + (1 - inv_soft_mask) * upsample_img + + if np.max(upsample_img) > 256: # 16-bit image + upsample_img = upsample_img.astype(np.uint16) + else: + upsample_img = upsample_img.astype(np.uint8) + + # draw bounding box + if draw_box: + # upsample_input_img = cv2.resize(input_img, (w_up, h_up)) + img_color = np.ones([*upsample_img.shape], dtype=np.float32) + img_color[:,:,0] = 0 + img_color[:,:,1] = 255 + img_color[:,:,2] = 0 + for inv_mask_border in inv_mask_borders: + upsample_img = inv_mask_border * img_color + (1 - inv_mask_border) * upsample_img + # upsample_input_img = inv_mask_border * img_color + (1 - inv_mask_border) * upsample_input_img + + if save_path is not None: + path = os.path.splitext(save_path)[0] + save_path = f'{path}.{self.save_ext}' + imwrite(upsample_img, save_path) + return upsample_img + + def clean_all(self): + self.all_landmarks_5 = [] + self.restored_faces = [] + self.affine_matrices = [] + self.cropped_faces = [] + self.inverse_affine_matrices = [] + self.det_faces = [] + self.pad_input_imgs = [] \ No newline at end of file diff --git a/repositories/CodeFormer/facelib/utils/face_utils.py b/repositories/CodeFormer/facelib/utils/face_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f1474a2a4419b6b62fab8a919ef805b802556464 --- /dev/null +++ b/repositories/CodeFormer/facelib/utils/face_utils.py @@ -0,0 +1,248 @@ +import cv2 +import numpy as np +import torch + + +def compute_increased_bbox(bbox, increase_area, preserve_aspect=True): + left, top, right, bot = bbox + width = right - left + height = bot - top + + if preserve_aspect: + width_increase = max(increase_area, ((1 + 2 * increase_area) * height - width) / (2 * width)) + height_increase = max(increase_area, ((1 + 2 * increase_area) * width - height) / (2 * height)) + else: + width_increase = height_increase = increase_area + left = int(left - width_increase * width) + top = int(top - height_increase * height) + right = int(right + width_increase * width) + bot = int(bot + height_increase * height) + return (left, top, right, bot) + + +def get_valid_bboxes(bboxes, h, w): + left = max(bboxes[0], 0) + top = max(bboxes[1], 0) + right = min(bboxes[2], w) + bottom = min(bboxes[3], h) + return (left, top, right, bottom) + + +def align_crop_face_landmarks(img, + landmarks, + output_size, + transform_size=None, + enable_padding=True, + return_inverse_affine=False, + shrink_ratio=(1, 1)): + """Align and crop face with landmarks. + + The output_size and transform_size are based on width. The height is + adjusted based on shrink_ratio_h/shrink_ratio_w. + + Modified from: + https://github.com/NVlabs/ffhq-dataset/blob/master/download_ffhq.py + + Args: + img (Numpy array): Input image. + landmarks (Numpy array): 5 or 68 or 98 landmarks. + output_size (int): Output face size. + transform_size (int): Transform size. Usually four times + output_size. + enable_padding (bool): Default: True. + shrink_ratio (float | tuple[float] | list[float]): Shrink the whole + face for height and width (crop a larger area). Default: (1, 1). + + Returns: + (Numpy array): Cropped face.
+ """ + lm_type = 'retinaface_5' # Options: dlib_5, retinaface_5 + + if isinstance(shrink_ratio, (float, int)): + shrink_ratio = (shrink_ratio, shrink_ratio) + if transform_size is None: + transform_size = output_size * 4 + + # Parse landmarks + lm = np.array(landmarks) + if lm.shape[0] == 5 and lm_type == 'retinaface_5': + eye_left = lm[0] + eye_right = lm[1] + mouth_avg = (lm[3] + lm[4]) * 0.5 + elif lm.shape[0] == 5 and lm_type == 'dlib_5': + lm_eye_left = lm[2:4] + lm_eye_right = lm[0:2] + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + mouth_avg = lm[4] + elif lm.shape[0] == 68: + lm_eye_left = lm[36:42] + lm_eye_right = lm[42:48] + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + mouth_avg = (lm[48] + lm[54]) * 0.5 + elif lm.shape[0] == 98: + lm_eye_left = lm[60:68] + lm_eye_right = lm[68:76] + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + mouth_avg = (lm[76] + lm[82]) * 0.5 + + eye_avg = (eye_left + eye_right) * 0.5 + eye_to_eye = eye_right - eye_left + eye_to_mouth = mouth_avg - eye_avg + + # Get the oriented crop rectangle + # x: half width of the oriented crop rectangle + x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1] + # - np.flipud(eye_to_mouth) * [-1, 1]: rotate 90 clockwise + # norm with the hypotenuse: get the direction + x /= np.hypot(*x) # get the hypotenuse of a right triangle + rect_scale = 1 # TODO: you can edit it to get larger rect + x *= max(np.hypot(*eye_to_eye) * 2.0 * rect_scale, np.hypot(*eye_to_mouth) * 1.8 * rect_scale) + # y: half height of the oriented crop rectangle + y = np.flipud(x) * [-1, 1] + + x *= shrink_ratio[1] # width + y *= shrink_ratio[0] # height + + # c: center + c = eye_avg + eye_to_mouth * 0.1 + # quad: (left_top, left_bottom, right_bottom, right_top) + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + # qsize: side length of the square + qsize = np.hypot(*x) * 2 + + quad_ori = np.copy(quad) + # Shrink, for large face + # TODO: do we really need shrink + shrink = int(np.floor(qsize / output_size * 0.5)) + if shrink > 1: + h, w = img.shape[0:2] + rsize = (int(np.rint(float(w) / shrink)), int(np.rint(float(h) / shrink))) + img = cv2.resize(img, rsize, interpolation=cv2.INTER_AREA) + quad /= shrink + qsize /= shrink + + # Crop + h, w = img.shape[0:2] + border = max(int(np.rint(qsize * 0.1)), 3) + crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, w), min(crop[3] + border, h)) + if crop[2] - crop[0] < w or crop[3] - crop[1] < h: + img = img[crop[1]:crop[3], crop[0]:crop[2], :] + quad -= crop[0:2] + + # Pad + # pad: (width_left, height_top, width_right, height_bottom) + h, w = img.shape[0:2] + pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - w + border, 0), max(pad[3] - h + border, 0)) + if enable_padding and max(pad) > border - 4: + pad = np.maximum(pad, int(np.rint(qsize * 0.3))) + img = np.pad(img, ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect') + h, w = img.shape[0:2] + y, x, _ = np.ogrid[:h, :w, :1] + mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], + np.float32(w - 1 - x) / pad[2]), + 1.0 - np.minimum(np.float32(y) / pad[1], + np.float32(h - 1 - y) / pad[3])) + blur = 
int(qsize * 0.02) + if blur % 2 == 0: + blur += 1 + blur_img = cv2.boxFilter(img, 0, ksize=(blur, blur)) + + img = img.astype('float32') + img += (blur_img - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0) + img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0) + img = np.clip(img, 0, 255) # float32, [0, 255] + quad += pad[:2] + + # Transform use cv2 + h_ratio = shrink_ratio[0] / shrink_ratio[1] + dst_h, dst_w = int(transform_size * h_ratio), transform_size + template = np.array([[0, 0], [0, dst_h], [dst_w, dst_h], [dst_w, 0]]) + # use cv2.LMEDS method for the equivalence to skimage transform + # ref: https://blog.csdn.net/yichxi/article/details/115827338 + affine_matrix = cv2.estimateAffinePartial2D(quad, template, method=cv2.LMEDS)[0] + cropped_face = cv2.warpAffine( + img, affine_matrix, (dst_w, dst_h), borderMode=cv2.BORDER_CONSTANT, borderValue=(135, 133, 132)) # gray + + if output_size < transform_size: + cropped_face = cv2.resize( + cropped_face, (output_size, int(output_size * h_ratio)), interpolation=cv2.INTER_LINEAR) + + if return_inverse_affine: + dst_h, dst_w = int(output_size * h_ratio), output_size + template = np.array([[0, 0], [0, dst_h], [dst_w, dst_h], [dst_w, 0]]) + # use cv2.LMEDS method for the equivalence to skimage transform + # ref: https://blog.csdn.net/yichxi/article/details/115827338 + affine_matrix = cv2.estimateAffinePartial2D( + quad_ori, np.array([[0, 0], [0, output_size], [dst_w, dst_h], [dst_w, 0]]), method=cv2.LMEDS)[0] + inverse_affine = cv2.invertAffineTransform(affine_matrix) + else: + inverse_affine = None + return cropped_face, inverse_affine + + +def paste_face_back(img, face, inverse_affine): + h, w = img.shape[0:2] + face_h, face_w = face.shape[0:2] + inv_restored = cv2.warpAffine(face, inverse_affine, (w, h)) + mask = np.ones((face_h, face_w, 3), dtype=np.float32) + inv_mask = cv2.warpAffine(mask, inverse_affine, (w, h)) + # remove the black borders + inv_mask_erosion = cv2.erode(inv_mask, np.ones((2, 2), np.uint8)) + inv_restored_remove_border = inv_mask_erosion * inv_restored + total_face_area = np.sum(inv_mask_erosion) // 3 + # compute the fusion edge based on the area of face + w_edge = int(total_face_area**0.5) // 20 + erosion_radius = w_edge * 2 + inv_mask_center = cv2.erode(inv_mask_erosion, np.ones((erosion_radius, erosion_radius), np.uint8)) + blur_size = w_edge * 2 + inv_soft_mask = cv2.GaussianBlur(inv_mask_center, (blur_size + 1, blur_size + 1), 0) + img = inv_soft_mask * inv_restored_remove_border + (1 - inv_soft_mask) * img + # float32, [0, 255] + return img + + +if __name__ == '__main__': + import os + + from facelib.detection import init_detection_model + from facelib.utils.face_restoration_helper import get_largest_face + + img_path = '/home/wxt/datasets/ffhq/ffhq_wild/00009.png' + img_name = os.splitext(os.path.basename(img_path))[0] + + # initialize model + det_net = init_detection_model('retinaface_resnet50', half=False) + img_ori = cv2.imread(img_path) + h, w = img_ori.shape[0:2] + # if larger than 800, scale it + scale = max(h / 800, w / 800) + if scale > 1: + img = cv2.resize(img_ori, (int(w / scale), int(h / scale)), interpolation=cv2.INTER_LINEAR) + + with torch.no_grad(): + bboxes = det_net.detect_faces(img, 0.97) + if scale > 1: + bboxes *= scale # the score is incorrect + bboxes = get_largest_face(bboxes, h, w)[0] + + landmarks = np.array([[bboxes[i], bboxes[i + 1]] for i in range(5, 15, 2)]) + + cropped_face, inverse_affine = align_crop_face_landmarks( + img_ori, + landmarks, + output_size=512, + 
transform_size=None, + enable_padding=True, + return_inverse_affine=True, + shrink_ratio=(1, 1)) + + cv2.imwrite(f'tmp/{img_name}_cropeed_face.png', cropped_face) + img = paste_face_back(img_ori, cropped_face, inverse_affine) + cv2.imwrite(f'tmp/{img_name}_back.png', img) diff --git a/repositories/CodeFormer/facelib/utils/misc.py b/repositories/CodeFormer/facelib/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..0918283c297a927fc0216670bbe78079087c6312 --- /dev/null +++ b/repositories/CodeFormer/facelib/utils/misc.py @@ -0,0 +1,141 @@ +import cv2 +import os +import os.path as osp +import torch +from torch.hub import download_url_to_file, get_dir +from urllib.parse import urlparse +# from basicsr.utils.download_util import download_file_from_google_drive +import gdown + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +def download_pretrained_models(file_ids, save_path_root): + os.makedirs(save_path_root, exist_ok=True) + + for file_name, file_id in file_ids.items(): + file_url = 'https://drive.google.com/uc?id='+file_id + save_path = osp.abspath(osp.join(save_path_root, file_name)) + if osp.exists(save_path): + user_response = input(f'{file_name} already exist. Do you want to cover it? Y/N\n') + if user_response.lower() == 'y': + print(f'Covering {file_name} to {save_path}') + gdown.download(file_url, save_path, quiet=False) + # download_file_from_google_drive(file_id, save_path) + elif user_response.lower() == 'n': + print(f'Skipping {file_name}') + else: + raise ValueError('Wrong input. Only accepts Y/N.') + else: + print(f'Downloading {file_name} to {save_path}') + gdown.download(file_url, save_path, quiet=False) + # download_file_from_google_drive(file_id, save_path) + + +def imwrite(img, file_path, params=None, auto_mkdir=True): + """Write image to file. + + Args: + img (ndarray): Image array to be written. + file_path (str): Image file path. + params (None or list): Same as opencv's :func:`imwrite` interface. + auto_mkdir (bool): If the parent folder of `file_path` does not exist, + whether to create it automatically. + + Returns: + bool: Successful or not. + """ + if auto_mkdir: + dir_name = os.path.abspath(os.path.dirname(file_path)) + os.makedirs(dir_name, exist_ok=True) + return cv2.imwrite(file_path, img, params) + + +def img2tensor(imgs, bgr2rgb=True, float32=True): + """Numpy array to tensor. + + Args: + imgs (list[ndarray] | ndarray): Input images. + bgr2rgb (bool): Whether to change bgr to rgb. + float32 (bool): Whether to change to float32. + + Returns: + list[tensor] | tensor: Tensor images. If returned results only have + one element, just return tensor. 
+ """ + + def _totensor(img, bgr2rgb, float32): + if img.shape[2] == 3 and bgr2rgb: + if img.dtype == 'float64': + img = img.astype('float32') + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = torch.from_numpy(img.transpose(2, 0, 1)) + if float32: + img = img.float() + return img + + if isinstance(imgs, list): + return [_totensor(img, bgr2rgb, float32) for img in imgs] + else: + return _totensor(imgs, bgr2rgb, float32) + + +def load_file_from_url(url, model_dir=None, progress=True, file_name=None): + """Ref:https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py + """ + if model_dir is None: + hub_dir = get_dir() + model_dir = os.path.join(hub_dir, 'checkpoints') + + os.makedirs(os.path.join(ROOT_DIR, model_dir), exist_ok=True) + + parts = urlparse(url) + filename = os.path.basename(parts.path) + if file_name is not None: + filename = file_name + cached_file = os.path.abspath(os.path.join(ROOT_DIR, model_dir, filename)) + if not os.path.exists(cached_file): + print(f'Downloading: "{url}" to {cached_file}\n') + download_url_to_file(url, cached_file, hash_prefix=None, progress=progress) + return cached_file + + +def scandir(dir_path, suffix=None, recursive=False, full_path=False): + """Scan a directory to find the interested files. + Args: + dir_path (str): Path of the directory. + suffix (str | tuple(str), optional): File suffix that we are + interested in. Default: None. + recursive (bool, optional): If set to True, recursively scan the + directory. Default: False. + full_path (bool, optional): If set to True, include the dir_path. + Default: False. + Returns: + A generator for all the interested files with relative paths. + """ + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('"suffix" must be a string or tuple of strings') + + root = dir_path + + def _scandir(dir_path, suffix, recursive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + if full_path: + return_path = entry.path + else: + return_path = osp.relpath(entry.path, root) + + if suffix is None: + yield return_path + elif return_path.endswith(suffix): + yield return_path + else: + if recursive: + yield from _scandir(entry.path, suffix=suffix, recursive=recursive) + else: + continue + + return _scandir(dir_path, suffix=suffix, recursive=recursive) diff --git a/repositories/CodeFormer/inference_codeformer.py b/repositories/CodeFormer/inference_codeformer.py new file mode 100644 index 0000000000000000000000000000000000000000..fdfe8b301cc7c20c2fb653618e379d243603a108 --- /dev/null +++ b/repositories/CodeFormer/inference_codeformer.py @@ -0,0 +1,189 @@ +# Modified by Shangchen Zhou from: https://github.com/TencentARC/GFPGAN/blob/master/inference_gfpgan.py +import os +import cv2 +import argparse +import glob +import torch +from torchvision.transforms.functional import normalize +from basicsr.utils import imwrite, img2tensor, tensor2img +from basicsr.utils.download_util import load_file_from_url +from facelib.utils.face_restoration_helper import FaceRestoreHelper +import torch.nn.functional as F + +from basicsr.utils.registry import ARCH_REGISTRY + +pretrain_model_url = { + 'restoration': 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth', +} + +def set_realesrgan(): + if not torch.cuda.is_available(): # CPU + import warnings + warnings.warn('The unoptimized RealESRGAN is slow on CPU. We do not use it. 
' + 'If you really want to use it, please modify the corresponding codes.', + category=RuntimeWarning) + bg_upsampler = None + else: + from basicsr.archs.rrdbnet_arch import RRDBNet + from basicsr.utils.realesrgan_utils import RealESRGANer + model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2) + bg_upsampler = RealESRGANer( + scale=2, + model_path='https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth', + model=model, + tile=args.bg_tile, + tile_pad=40, + pre_pad=0, + half=True) # need to set False in CPU mode + return bg_upsampler + +if __name__ == '__main__': + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + parser = argparse.ArgumentParser() + + parser.add_argument('--w', type=float, default=0.5, help='Balance the quality and fidelity') + parser.add_argument('--upscale', type=int, default=2, help='The final upsampling scale of the image. Default: 2') + parser.add_argument('--test_path', type=str, default='./inputs/cropped_faces') + parser.add_argument('--has_aligned', action='store_true', help='Input are cropped and aligned faces') + parser.add_argument('--only_center_face', action='store_true', help='Only restore the center face') + # large det_model: 'YOLOv5l', 'retinaface_resnet50' + # small det_model: 'YOLOv5n', 'retinaface_mobile0.25' + parser.add_argument('--detection_model', type=str, default='retinaface_resnet50') + parser.add_argument('--draw_box', action='store_true') + parser.add_argument('--bg_upsampler', type=str, default='None', help='background upsampler. Optional: realesrgan') + parser.add_argument('--face_upsample', action='store_true', help='face upsampler after enhancement.') + parser.add_argument('--bg_tile', type=int, default=400, help='Tile size for background sampler. 
Default: 400') + + args = parser.parse_args() + + # ------------------------ input & output ------------------------ + if args.test_path.endswith('/'): # solve when path ends with / + args.test_path = args.test_path[:-1] + + w = args.w + result_root = f'results/{os.path.basename(args.test_path)}_{w}' + + # ------------------ set up background upsampler ------------------ + if args.bg_upsampler == 'realesrgan': + bg_upsampler = set_realesrgan() + else: + bg_upsampler = None + + # ------------------ set up face upsampler ------------------ + if args.face_upsample: + if bg_upsampler is not None: + face_upsampler = bg_upsampler + else: + face_upsampler = set_realesrgan() + else: + face_upsampler = None + + # ------------------ set up CodeFormer restorer ------------------- + net = ARCH_REGISTRY.get('CodeFormer')(dim_embd=512, codebook_size=1024, n_head=8, n_layers=9, + connect_list=['32', '64', '128', '256']).to(device) + + # ckpt_path = 'weights/CodeFormer/codeformer.pth' + ckpt_path = load_file_from_url(url=pretrain_model_url['restoration'], + model_dir='weights/CodeFormer', progress=True, file_name=None) + checkpoint = torch.load(ckpt_path)['params_ema'] + net.load_state_dict(checkpoint) + net.eval() + + # ------------------ set up FaceRestoreHelper ------------------- + # large det_model: 'YOLOv5l', 'retinaface_resnet50' + # small det_model: 'YOLOv5n', 'retinaface_mobile0.25' + if not args.has_aligned: + print(f'Face detection model: {args.detection_model}') + if bg_upsampler is not None: + print(f'Background upsampling: True, Face upsampling: {args.face_upsample}') + else: + print(f'Background upsampling: False, Face upsampling: {args.face_upsample}') + + face_helper = FaceRestoreHelper( + args.upscale, + face_size=512, + crop_ratio=(1, 1), + det_model = args.detection_model, + save_ext='png', + use_parse=True, + device=device) + + # -------------------- start to processing --------------------- + # scan all the jpg and png images + for img_path in sorted(glob.glob(os.path.join(args.test_path, '*.[jp][pn]g'))): + # clean all the intermediate results to process the next image + face_helper.clean_all() + + img_name = os.path.basename(img_path) + print(f'Processing: {img_name}') + basename, ext = os.path.splitext(img_name) + img = cv2.imread(img_path, cv2.IMREAD_COLOR) + + if args.has_aligned: + # the input faces are already cropped and aligned + img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_LINEAR) + face_helper.cropped_faces = [img] + else: + face_helper.read_image(img) + # get face landmarks for each face + num_det_faces = face_helper.get_face_landmarks_5( + only_center_face=args.only_center_face, resize=640, eye_dist_threshold=5) + print(f'\tdetect {num_det_faces} faces') + # align and warp each face + face_helper.align_warp_face() + + # face restoration for each cropped face + for idx, cropped_face in enumerate(face_helper.cropped_faces): + # prepare data + cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True) + normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True) + cropped_face_t = cropped_face_t.unsqueeze(0).to(device) + + try: + with torch.no_grad(): + output = net(cropped_face_t, w=w, adain=True)[0] + restored_face = tensor2img(output, rgb2bgr=True, min_max=(-1, 1)) + del output + torch.cuda.empty_cache() + except Exception as error: + print(f'\tFailed inference for CodeFormer: {error}') + restored_face = tensor2img(cropped_face_t, rgb2bgr=True, min_max=(-1, 1)) + + restored_face = restored_face.astype('uint8') + 
face_helper.add_restored_face(restored_face) + + # paste_back + if not args.has_aligned: + # upsample the background + if bg_upsampler is not None: + # Now only support RealESRGAN for upsampling background + bg_img = bg_upsampler.enhance(img, outscale=args.upscale)[0] + else: + bg_img = None + face_helper.get_inverse_affine(None) + # paste each restored face to the input image + if args.face_upsample and face_upsampler is not None: + restored_img = face_helper.paste_faces_to_input_image(upsample_img=bg_img, draw_box=args.draw_box, face_upsampler=face_upsampler) + else: + restored_img = face_helper.paste_faces_to_input_image(upsample_img=bg_img, draw_box=args.draw_box) + + # save faces + for idx, (cropped_face, restored_face) in enumerate(zip(face_helper.cropped_faces, face_helper.restored_faces)): + # save cropped face + if not args.has_aligned: + save_crop_path = os.path.join(result_root, 'cropped_faces', f'{basename}_{idx:02d}.png') + imwrite(cropped_face, save_crop_path) + # save restored face + if args.has_aligned: + save_face_name = f'{basename}.png' + else: + save_face_name = f'{basename}_{idx:02d}.png' + save_restore_path = os.path.join(result_root, 'restored_faces', save_face_name) + imwrite(restored_face, save_restore_path) + + # save restored img + if not args.has_aligned and restored_img is not None: + save_restore_path = os.path.join(result_root, 'final_results', f'{basename}.png') + imwrite(restored_img, save_restore_path) + + print(f'\nAll results are saved in {result_root}') diff --git a/repositories/CodeFormer/inputs/cropped_faces/0143.png b/repositories/CodeFormer/inputs/cropped_faces/0143.png new file mode 100644 index 0000000000000000000000000000000000000000..065b3f001f43e7767308d3c2d2e5496cbe3660c7 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0143.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0240.png b/repositories/CodeFormer/inputs/cropped_faces/0240.png new file mode 100644 index 0000000000000000000000000000000000000000..7a117017a27defc68c875f97322fdf2850c0e50b Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0240.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0342.png b/repositories/CodeFormer/inputs/cropped_faces/0342.png new file mode 100644 index 0000000000000000000000000000000000000000..8f5aeeae1886ee3ec89dff52d30bcd2a8ad0ad4f Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0342.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0345.png b/repositories/CodeFormer/inputs/cropped_faces/0345.png new file mode 100644 index 0000000000000000000000000000000000000000..b8f71b6d8e437c1ffef045182ca0fcfb2f058e5c Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0345.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0368.png b/repositories/CodeFormer/inputs/cropped_faces/0368.png new file mode 100644 index 0000000000000000000000000000000000000000..262778a98196bd3f19f71ad49a9c81db417eae8c Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0368.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0412.png b/repositories/CodeFormer/inputs/cropped_faces/0412.png new file mode 100644 index 0000000000000000000000000000000000000000..c4a63b13df67ec8392c14daa204f167d2c664c98 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0412.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0444.png 
b/repositories/CodeFormer/inputs/cropped_faces/0444.png new file mode 100644 index 0000000000000000000000000000000000000000..9028dd0e3b65286a79e2a7538b5fb38b0f7a92ff Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0444.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0478.png b/repositories/CodeFormer/inputs/cropped_faces/0478.png new file mode 100644 index 0000000000000000000000000000000000000000..f0924061b574a9e2124c686db8e37995ebed850c Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0478.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0500.png b/repositories/CodeFormer/inputs/cropped_faces/0500.png new file mode 100644 index 0000000000000000000000000000000000000000..7a7146b473ba7cbec6eb65e7db6891ad1ceb62b7 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0500.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0599.png b/repositories/CodeFormer/inputs/cropped_faces/0599.png new file mode 100644 index 0000000000000000000000000000000000000000..ff26ccdaf1ef441e1366fe2d1b4fa7ea1b8f13a2 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0599.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0717.png b/repositories/CodeFormer/inputs/cropped_faces/0717.png new file mode 100644 index 0000000000000000000000000000000000000000..9342b5e55fd7d1698936628a7684d0c7ca2d8349 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0717.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0720.png b/repositories/CodeFormer/inputs/cropped_faces/0720.png new file mode 100644 index 0000000000000000000000000000000000000000..af384dce912c081073e4e5fed381dd2385159567 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0720.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0729.png b/repositories/CodeFormer/inputs/cropped_faces/0729.png new file mode 100644 index 0000000000000000000000000000000000000000..4f70f46e134775659d5ff40098acbafe6c1111c4 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0729.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0763.png b/repositories/CodeFormer/inputs/cropped_faces/0763.png new file mode 100644 index 0000000000000000000000000000000000000000..1263df7b03de9947281263ac79aaa7e8a1806860 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0763.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0770.png b/repositories/CodeFormer/inputs/cropped_faces/0770.png new file mode 100644 index 0000000000000000000000000000000000000000..40a64e832d7701c1bfcdc29e89eb1989487d009b Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0770.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0777.png b/repositories/CodeFormer/inputs/cropped_faces/0777.png new file mode 100644 index 0000000000000000000000000000000000000000..c72cb26ff64b7e06767e0252caa6c3f3f0538d27 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0777.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0885.png b/repositories/CodeFormer/inputs/cropped_faces/0885.png new file mode 100644 index 0000000000000000000000000000000000000000..f3ea2632f7671749c15981a966289fafb1765aa3 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0885.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/0934.png 
b/repositories/CodeFormer/inputs/cropped_faces/0934.png new file mode 100644 index 0000000000000000000000000000000000000000..bf82c2d3a36260d589687d21325700e9cf48a889 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/0934.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/Solvay_conference_1927_0018.png b/repositories/CodeFormer/inputs/cropped_faces/Solvay_conference_1927_0018.png new file mode 100644 index 0000000000000000000000000000000000000000..0f79547a80af7684e5f3b8bef8160ad9f1c85773 Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/Solvay_conference_1927_0018.png differ diff --git a/repositories/CodeFormer/inputs/cropped_faces/Solvay_conference_1927_2_16.png b/repositories/CodeFormer/inputs/cropped_faces/Solvay_conference_1927_2_16.png new file mode 100644 index 0000000000000000000000000000000000000000..f75b9602f3a8b2210fc459b22d9a67011404709e Binary files /dev/null and b/repositories/CodeFormer/inputs/cropped_faces/Solvay_conference_1927_2_16.png differ diff --git a/repositories/CodeFormer/inputs/whole_imgs/00.jpg b/repositories/CodeFormer/inputs/whole_imgs/00.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d6e323e56d782d566a024a1d25282996904ffebd Binary files /dev/null and b/repositories/CodeFormer/inputs/whole_imgs/00.jpg differ diff --git a/repositories/CodeFormer/inputs/whole_imgs/01.jpg b/repositories/CodeFormer/inputs/whole_imgs/01.jpg new file mode 100644 index 0000000000000000000000000000000000000000..485fc6a51a066a15ff8164150afef4028a304242 Binary files /dev/null and b/repositories/CodeFormer/inputs/whole_imgs/01.jpg differ diff --git a/repositories/CodeFormer/inputs/whole_imgs/02.png b/repositories/CodeFormer/inputs/whole_imgs/02.png new file mode 100644 index 0000000000000000000000000000000000000000..378e7b159223e49e9967de53d0c121558b9a56ef Binary files /dev/null and b/repositories/CodeFormer/inputs/whole_imgs/02.png differ diff --git a/repositories/CodeFormer/inputs/whole_imgs/03.jpg b/repositories/CodeFormer/inputs/whole_imgs/03.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b6c8428105e821801c07e6d4c8dbbc3c72814a58 Binary files /dev/null and b/repositories/CodeFormer/inputs/whole_imgs/03.jpg differ diff --git a/repositories/CodeFormer/inputs/whole_imgs/04.jpg b/repositories/CodeFormer/inputs/whole_imgs/04.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bb94681a7ff03ef57e0951a6c1fbfe6329571950 Binary files /dev/null and b/repositories/CodeFormer/inputs/whole_imgs/04.jpg differ diff --git a/repositories/CodeFormer/inputs/whole_imgs/05.jpg b/repositories/CodeFormer/inputs/whole_imgs/05.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4dc33735f9ad7c02b591810cc38c033f21be6c81 Binary files /dev/null and b/repositories/CodeFormer/inputs/whole_imgs/05.jpg differ diff --git a/repositories/CodeFormer/inputs/whole_imgs/06.png b/repositories/CodeFormer/inputs/whole_imgs/06.png new file mode 100644 index 0000000000000000000000000000000000000000..49c2fff2c655aaaddcd38920a063307c7ecc5152 Binary files /dev/null and b/repositories/CodeFormer/inputs/whole_imgs/06.png differ diff --git a/repositories/CodeFormer/predict.py b/repositories/CodeFormer/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..01ecc6799dee747257167516bfcd66b98efec925 --- /dev/null +++ b/repositories/CodeFormer/predict.py @@ -0,0 +1,188 @@ +""" +download checkpoints to ./weights beforehand +python 
scripts/download_pretrained_models.py facelib +python scripts/download_pretrained_models.py CodeFormer +wget 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth' +""" + +import tempfile +import cv2 +import torch +from torchvision.transforms.functional import normalize +from cog import BasePredictor, Input, Path + +from basicsr.utils import imwrite, img2tensor, tensor2img +from basicsr.archs.rrdbnet_arch import RRDBNet +from basicsr.utils.realesrgan_utils import RealESRGANer +from basicsr.utils.registry import ARCH_REGISTRY +from facelib.utils.face_restoration_helper import FaceRestoreHelper + + +class Predictor(BasePredictor): + def setup(self): + """Load the model into memory to make running multiple predictions efficient""" + self.device = "cuda:0" + self.bg_upsampler = set_realesrgan() + self.net = ARCH_REGISTRY.get("CodeFormer")( + dim_embd=512, + codebook_size=1024, + n_head=8, + n_layers=9, + connect_list=["32", "64", "128", "256"], + ).to(self.device) + ckpt_path = "weights/CodeFormer/codeformer.pth" + checkpoint = torch.load(ckpt_path)[ + "params_ema" + ] # update file permission if cannot load + self.net.load_state_dict(checkpoint) + self.net.eval() + + def predict( + self, + image: Path = Input(description="Input image"), + codeformer_fidelity: float = Input( + default=0.5, + ge=0, + le=1, + description="Balance the quality (lower number) and fidelity (higher number).", + ), + background_enhance: bool = Input( + description="Enhance background image with Real-ESRGAN", default=True + ), + face_upsample: bool = Input( + description="Upsample restored faces for high-resolution AI-created images", + default=True, + ), + upscale: int = Input( + description="The final upsampling scale of the image", + default=2, + ), + ) -> Path: + """Run a single prediction on the model""" + + # take the default setting for the demo + has_aligned = False + only_center_face = False + draw_box = False + detection_model = "retinaface_resnet50" + + self.face_helper = FaceRestoreHelper( + upscale, + face_size=512, + crop_ratio=(1, 1), + det_model=detection_model, + save_ext="png", + use_parse=True, + device=self.device, + ) + + bg_upsampler = self.bg_upsampler if background_enhance else None + face_upsampler = self.bg_upsampler if face_upsample else None + + img = cv2.imread(str(image), cv2.IMREAD_COLOR) + + if has_aligned: + # the input faces are already cropped and aligned + img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_LINEAR) + self.face_helper.cropped_faces = [img] + else: + self.face_helper.read_image(img) + # get face landmarks for each face + num_det_faces = self.face_helper.get_face_landmarks_5( + only_center_face=only_center_face, resize=640, eye_dist_threshold=5 + ) + print(f"\tdetect {num_det_faces} faces") + # align and warp each face + self.face_helper.align_warp_face() + + # face restoration for each cropped face + for idx, cropped_face in enumerate(self.face_helper.cropped_faces): + # prepare data + cropped_face_t = img2tensor( + cropped_face / 255.0, bgr2rgb=True, float32=True + ) + normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True) + cropped_face_t = cropped_face_t.unsqueeze(0).to(self.device) + + try: + with torch.no_grad(): + output = self.net( + cropped_face_t, w=codeformer_fidelity, adain=True + )[0] + restored_face = tensor2img(output, rgb2bgr=True, min_max=(-1, 1)) + del output + torch.cuda.empty_cache() + except Exception as error: + print(f"\tFailed inference for CodeFormer: {error}") + restored_face = 
tensor2img( + cropped_face_t, rgb2bgr=True, min_max=(-1, 1) + ) + + restored_face = restored_face.astype("uint8") + self.face_helper.add_restored_face(restored_face) + + # paste_back + if not has_aligned: + # upsample the background + if bg_upsampler is not None: + # Now only support RealESRGAN for upsampling background + bg_img = bg_upsampler.enhance(img, outscale=upscale)[0] + else: + bg_img = None + self.face_helper.get_inverse_affine(None) + # paste each restored face to the input image + if face_upsample and face_upsampler is not None: + restored_img = self.face_helper.paste_faces_to_input_image( + upsample_img=bg_img, + draw_box=draw_box, + face_upsampler=face_upsampler, + ) + else: + restored_img = self.face_helper.paste_faces_to_input_image( + upsample_img=bg_img, draw_box=draw_box + ) + + # save restored img + out_path = Path(tempfile.mkdtemp()) / "output.png" + + if not has_aligned and restored_img is not None: + imwrite(restored_img, str(out_path)) + + return out_path + + +def imread(img_path): + img = cv2.imread(img_path) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + return img + + +def set_realesrgan(): + if not torch.cuda.is_available(): # CPU + import warnings + + warnings.warn( + "The unoptimized RealESRGAN is slow on CPU. We do not use it. " + "If you really want to use it, please modify the corresponding codes.", + category=RuntimeWarning, + ) + bg_upsampler = None + else: + model = RRDBNet( + num_in_ch=3, + num_out_ch=3, + num_feat=64, + num_block=23, + num_grow_ch=32, + scale=2, + ) + bg_upsampler = RealESRGANer( + scale=2, + model_path="./weights/RealESRGAN_x2plus.pth", + model=model, + tile=400, + tile_pad=40, + pre_pad=0, + half=True, + ) + return bg_upsampler diff --git a/repositories/CodeFormer/requirements.txt b/repositories/CodeFormer/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f97dfde85ebe83708fc1f6f7234a0ef69f18bde5 --- /dev/null +++ b/repositories/CodeFormer/requirements.txt @@ -0,0 +1,20 @@ +addict +future +lmdb +numpy +opencv-python +Pillow +pyyaml +requests +scikit-image +scipy +tb-nightly +torch>=1.7.1 +torchvision +tqdm +yapf +lpips +gdown # supports downloading the large file from Google Drive +# cmake +# dlib +# conda install -c conda-forge dlib \ No newline at end of file diff --git a/repositories/CodeFormer/scripts/crop_align_face.py b/repositories/CodeFormer/scripts/crop_align_face.py new file mode 100644 index 0000000000000000000000000000000000000000..31e66266ac0e5f818fa18b6409993151086bbc8b --- /dev/null +++ b/repositories/CodeFormer/scripts/crop_align_face.py @@ -0,0 +1,192 @@ +""" +brief: face alignment with FFHQ method (https://github.com/NVlabs/ffhq-dataset) +author: lzhbrian (https://lzhbrian.me) +link: https://gist.github.com/lzhbrian/bde87ab23b499dd02ba4f588258f57d5 +date: 2020.1.5 +note: code is heavily borrowed from + https://github.com/NVlabs/ffhq-dataset + http://dlib.net/face_landmark_detection.py.html +requirements: + conda install Pillow numpy scipy + conda install -c conda-forge dlib + # download face landmark model from: + # http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2 +""" + +import cv2 +import dlib +import glob +import numpy as np +import os +import PIL +import PIL.Image +import scipy +import scipy.ndimage +import sys +import argparse + +# download model from: http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2 +predictor = dlib.shape_predictor('weights/dlib/shape_predictor_68_face_landmarks-fbdc2cb8.dat') + + +def get_landmark(filepath, 
only_keep_largest=True): + """get landmark with dlib + :return: np.array shape=(68, 2) + """ + detector = dlib.get_frontal_face_detector() + + img = dlib.load_rgb_image(filepath) + dets = detector(img, 1) + + # Shangchen modified + print("Number of faces detected: {}".format(len(dets))) + if only_keep_largest: + print('Detect several faces and only keep the largest.') + face_areas = [] + for k, d in enumerate(dets): + face_area = (d.right() - d.left()) * (d.bottom() - d.top()) + face_areas.append(face_area) + + largest_idx = face_areas.index(max(face_areas)) + d = dets[largest_idx] + shape = predictor(img, d) + print("Part 0: {}, Part 1: {} ...".format( + shape.part(0), shape.part(1))) + else: + for k, d in enumerate(dets): + print("Detection {}: Left: {} Top: {} Right: {} Bottom: {}".format( + k, d.left(), d.top(), d.right(), d.bottom())) + # Get the landmarks/parts for the face in box d. + shape = predictor(img, d) + print("Part 0: {}, Part 1: {} ...".format( + shape.part(0), shape.part(1))) + + t = list(shape.parts()) + a = [] + for tt in t: + a.append([tt.x, tt.y]) + lm = np.array(a) + # lm is a shape=(68,2) np.array + return lm + +def align_face(filepath, out_path): + """ + :param filepath: str + :return: PIL Image + """ + try: + lm = get_landmark(filepath) + except: + print('No landmark ...') + return + + lm_chin = lm[0:17] # left-right + lm_eyebrow_left = lm[17:22] # left-right + lm_eyebrow_right = lm[22:27] # left-right + lm_nose = lm[27:31] # top-down + lm_nostrils = lm[31:36] # top-down + lm_eye_left = lm[36:42] # left-clockwise + lm_eye_right = lm[42:48] # left-clockwise + lm_mouth_outer = lm[48:60] # left-clockwise + lm_mouth_inner = lm[60:68] # left-clockwise + + # Calculate auxiliary vectors. + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + eye_avg = (eye_left + eye_right) * 0.5 + eye_to_eye = eye_right - eye_left + mouth_left = lm_mouth_outer[0] + mouth_right = lm_mouth_outer[6] + mouth_avg = (mouth_left + mouth_right) * 0.5 + eye_to_mouth = mouth_avg - eye_avg + + # Choose oriented crop rectangle. + x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1] + x /= np.hypot(*x) + x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8) + y = np.flipud(x) * [-1, 1] + c = eye_avg + eye_to_mouth * 0.1 + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + qsize = np.hypot(*x) * 2 + + # read image + img = PIL.Image.open(filepath) + + output_size = 512 + transform_size = 4096 + enable_padding = False + + # Shrink. + shrink = int(np.floor(qsize / output_size * 0.5)) + if shrink > 1: + rsize = (int(np.rint(float(img.size[0]) / shrink)), + int(np.rint(float(img.size[1]) / shrink))) + img = img.resize(rsize, PIL.Image.ANTIALIAS) + quad /= shrink + qsize /= shrink + + # Crop. + border = max(int(np.rint(qsize * 0.1)), 3) + crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), + int(np.ceil(max(quad[:, 0]))), int(np.ceil(max(quad[:, 1])))) + crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), + min(crop[2] + border, + img.size[0]), min(crop[3] + border, img.size[1])) + if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]: + img = img.crop(crop) + quad -= crop[0:2] + + # Pad. 
+ pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), + int(np.ceil(max(quad[:, 0]))), int(np.ceil(max(quad[:, 1])))) + pad = (max(-pad[0] + border, + 0), max(-pad[1] + border, + 0), max(pad[2] - img.size[0] + border, + 0), max(pad[3] - img.size[1] + border, 0)) + if enable_padding and max(pad) > border - 4: + pad = np.maximum(pad, int(np.rint(qsize * 0.3))) + img = np.pad( + np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), + 'reflect') + h, w, _ = img.shape + y, x, _ = np.ogrid[:h, :w, :1] + mask = np.maximum( + 1.0 - + np.minimum(np.float32(x) / pad[0], + np.float32(w - 1 - x) / pad[2]), 1.0 - + np.minimum(np.float32(y) / pad[1], + np.float32(h - 1 - y) / pad[3])) + blur = qsize * 0.02 + img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - + img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0) + img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0) + img = PIL.Image.fromarray( + np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB') + quad += pad[:2] + + img = img.transform((transform_size, transform_size), PIL.Image.QUAD, + (quad + 0.5).flatten(), PIL.Image.BILINEAR) + + if output_size < transform_size: + img = img.resize((output_size, output_size), PIL.Image.ANTIALIAS) + + # Save aligned image. + print('saveing: ', out_path) + img.save(out_path) + + return img, np.max(quad[:, 0]) - np.min(quad[:, 0]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--in_dir', type=str, default='./inputs/whole_imgs') + parser.add_argument('--out_dir', type=str, default='./inputs/cropped_faces') + args = parser.parse_args() + + img_list = sorted(glob.glob(f'{args.in_dir}/*.png')) + img_list = sorted(img_list) + + for in_path in img_list: + out_path = os.path.join(args.out_dir, in_path.split("/")[-1]) + out_path = out_path.replace('.jpg', '.png') + size_ = align_face(in_path, out_path) \ No newline at end of file diff --git a/repositories/CodeFormer/scripts/download_pretrained_models.py b/repositories/CodeFormer/scripts/download_pretrained_models.py new file mode 100644 index 0000000000000000000000000000000000000000..daa6e8ca14ea91c89a318e85d9f182eb7d1bf025 --- /dev/null +++ b/repositories/CodeFormer/scripts/download_pretrained_models.py @@ -0,0 +1,40 @@ +import argparse +import os +from os import path as osp + +from basicsr.utils.download_util import load_file_from_url + + +def download_pretrained_models(method, file_urls): + save_path_root = f'./weights/{method}' + os.makedirs(save_path_root, exist_ok=True) + + for file_name, file_url in file_urls.items(): + save_path = load_file_from_url(url=file_url, model_dir=save_path_root, progress=True, file_name=file_name) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument( + 'method', + type=str, + help=("Options: 'CodeFormer' 'facelib'. 
Set to 'all' to download all the models.")) + args = parser.parse_args() + + file_urls = { + 'CodeFormer': { + 'codeformer.pth': 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth' + }, + 'facelib': { + # 'yolov5l-face.pth': 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/yolov5l-face.pth', + 'detection_Resnet50_Final.pth': 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/detection_Resnet50_Final.pth', + 'parsing_parsenet.pth': 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/parsing_parsenet.pth' + } + } + + if args.method == 'all': + for method in file_urls.keys(): + download_pretrained_models(method, file_urls[method]) + else: + download_pretrained_models(args.method, file_urls[args.method]) \ No newline at end of file diff --git a/repositories/CodeFormer/scripts/download_pretrained_models_from_gdrive.py b/repositories/CodeFormer/scripts/download_pretrained_models_from_gdrive.py new file mode 100644 index 0000000000000000000000000000000000000000..7df5be6fc260394ee9bbd0a7ae377e2ca657fe83 --- /dev/null +++ b/repositories/CodeFormer/scripts/download_pretrained_models_from_gdrive.py @@ -0,0 +1,60 @@ +import argparse +import os +from os import path as osp + +# from basicsr.utils.download_util import download_file_from_google_drive +import gdown + + +def download_pretrained_models(method, file_ids): + save_path_root = f'./weights/{method}' + os.makedirs(save_path_root, exist_ok=True) + + for file_name, file_id in file_ids.items(): + file_url = 'https://drive.google.com/uc?id='+file_id + save_path = osp.abspath(osp.join(save_path_root, file_name)) + if osp.exists(save_path): + user_response = input(f'{file_name} already exist. Do you want to cover it? Y/N\n') + if user_response.lower() == 'y': + print(f'Covering {file_name} to {save_path}') + gdown.download(file_url, save_path, quiet=False) + # download_file_from_google_drive(file_id, save_path) + elif user_response.lower() == 'n': + print(f'Skipping {file_name}') + else: + raise ValueError('Wrong input. Only accepts Y/N.') + else: + print(f'Downloading {file_name} to {save_path}') + gdown.download(file_url, save_path, quiet=False) + # download_file_from_google_drive(file_id, save_path) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument( + 'method', + type=str, + help=("Options: 'CodeFormer' 'facelib'. 
Set to 'all' to download all the models.")) + args = parser.parse_args() + + # file name: file id + # 'dlib': { + # 'mmod_human_face_detector-4cb19393.dat': '1qD-OqY8M6j4PWUP_FtqfwUPFPRMu6ubX', + # 'shape_predictor_5_face_landmarks-c4b1e980.dat': '1vF3WBUApw4662v9Pw6wke3uk1qxnmLdg', + # 'shape_predictor_68_face_landmarks-fbdc2cb8.dat': '1tJyIVdCHaU6IDMDx86BZCxLGZfsWB8yq' + # } + file_ids = { + 'CodeFormer': { + 'codeformer.pth': '1v_E_vZvP-dQPF55Kc5SRCjaKTQXDz-JB' + }, + 'facelib': { + 'yolov5l-face.pth': '131578zMA6B2x8VQHyHfa6GEPtulMCNzV', + 'parsing_parsenet.pth': '16pkohyZZ8ViHGBk3QtVqxLZKzdo466bK' + } + } + + if args.method == 'all': + for method in file_ids.keys(): + download_pretrained_models(method, file_ids[method]) + else: + download_pretrained_models(args.method, file_ids[args.method]) \ No newline at end of file diff --git a/repositories/CodeFormer/weights/CodeFormer/.gitkeep b/repositories/CodeFormer/weights/CodeFormer/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/repositories/CodeFormer/weights/README.md b/repositories/CodeFormer/weights/README.md new file mode 100644 index 0000000000000000000000000000000000000000..67ad334bd672eeb9f82813cd54e8885331bbb2f2 --- /dev/null +++ b/repositories/CodeFormer/weights/README.md @@ -0,0 +1,3 @@ +# Weights + +Put the downloaded pre-trained models to this folder. \ No newline at end of file diff --git a/repositories/CodeFormer/weights/facelib/.gitkeep b/repositories/CodeFormer/weights/facelib/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/repositories/generative-models/.gitignore b/repositories/generative-models/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0c70c5e906550d3b9a1017bd3b4516c154c92d70 --- /dev/null +++ b/repositories/generative-models/.gitignore @@ -0,0 +1,7 @@ +.pt2 +.pt2_2 +.pt13 +*.egg-info +build +/outputs +/checkpoints \ No newline at end of file diff --git a/repositories/generative-models/LICENSE b/repositories/generative-models/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..b01c5a6838f0c01a347aaf83e2fde3b59a36006d --- /dev/null +++ b/repositories/generative-models/LICENSE @@ -0,0 +1,75 @@ +SDXL 0.9 RESEARCH LICENSE AGREEMENT +Copyright (c) Stability AI Ltd. +This License Agreement (as may be amended in accordance with this License Agreement, “License”), between you, or your employer or other entity (if you are entering into this agreement on behalf of your employer or other entity) (“Licensee” or “you”) and Stability AI Ltd. (“Stability AI” or “we”) applies to your use of any computer program, algorithm, source code, object code, or software that is made available by Stability AI under this License (“Software”) and any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software (“Documentation”). +By clicking “I Accept” below or by using the Software, you agree to the terms of this License. If you do not agree to this License, then you do not have any rights to use the Software or Documentation (collectively, the “Software Products”), and you must immediately cease using the Software Products. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to Stability AI that you have full legal authority to bind your employer or such entity to this License. 
If you do not have the requisite authority, you may not accept the License or access the Software Products on behalf of your employer or other entity. +1. LICENSE GRANT + +a. Subject to your compliance with the Documentation and Sections 2, 3, and 5, Stability AI grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Stability AI’s copyright interests to reproduce, distribute, and create derivative works of the Software solely for your non-commercial research purposes. The foregoing license is personal to you, and you may not assign or sublicense this License or any other rights or obligations under this License without Stability AI’s prior written consent; any such assignment or sublicense will be void and will automatically and immediately terminate this License. + +b. You may make a reasonable number of copies of the Documentation solely for use in connection with the license to the Software granted above. + +c. The grant of rights expressly set forth in this Section 1 (License Grant) are the complete grant of rights to you in the Software Products, and no other licenses are granted, whether by waiver, estoppel, implication, equity or otherwise. Stability AI and its licensors reserve all rights not expressly granted by this License. + + +2. RESTRICTIONS + +You will not, and will not permit, assist or cause any third party to: + +a. use, modify, copy, reproduce, create derivative works of, or distribute the Software Products (or any derivative works thereof, works incorporating the Software Products, or any data produced by the Software), in whole or in part, for (i) any commercial or production purposes, (ii) military purposes or in the service of nuclear technology, (iii) purposes of surveillance, including any research or development relating to surveillance, (iv) biometric processing, (v) in any manner that infringes, misappropriates, or otherwise violates any third-party rights, or (vi) in any manner that violates any applicable law and violating any privacy or security laws, rules, regulations, directives, or governmental requirements (including the General Data Privacy Regulation (Regulation (EU) 2016/679), the California Consumer Privacy Act, and any and all laws governing the processing of biometric information), as well as all amendments and successor laws to any of the foregoing; + +b. alter or remove copyright and other proprietary notices which appear on or in the Software Products; + +c. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Stability AI in connection with the Software, or to circumvent or remove any usage restrictions, or to enable functionality disabled by Stability AI; or + +d. offer or impose any terms on the Software Products that alter, restrict, or are inconsistent with the terms of this License. + +e. 1) violate any applicable U.S. and non-U.S. export control and trade sanctions laws (“Export Laws”); 2) directly or indirectly export, re-export, provide, or otherwise transfer Software Products: (a) to any individual, entity, or country prohibited by Export Laws; (b) to anyone on U.S. or non-U.S. government restricted parties lists; or (c) for any purpose prohibited by Export Laws, including nuclear, chemical or biological weapons, or missile technology applications; 3) use or download Software Products if you or they are: (a) located in a comprehensively sanctioned jurisdiction, (b) currently listed on any U.S. 
or non-U.S. restricted parties list, or (c) for any purpose prohibited by Export Laws; and (4) will not disguise your location through IP proxying or other methods. + + +3. ATTRIBUTION + +Together with any copies of the Software Products (as well as derivative works thereof or works incorporating the Software Products) that you distribute, you must provide (i) a copy of this License, and (ii) the following attribution notice: “SDXL 0.9 is licensed under the SDXL Research License, Copyright (c) Stability AI Ltd. All Rights Reserved.” + + +4. DISCLAIMERS + +THE SOFTWARE PRODUCTS ARE PROVIDED “AS IS” AND “WITH ALL FAULTS” WITH NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. STABILITY AIEXPRESSLY DISCLAIMS ALL REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY STATUTE, CUSTOM, USAGE OR OTHERWISE AS TO ANY MATTERS RELATED TO THE SOFTWARE PRODUCTS, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, SATISFACTORY QUALITY, OR NON-INFRINGEMENT. STABILITY AI MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE SOFTWARE PRODUCTS WILL BE ERROR FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR PRODUCE ANY PARTICULAR RESULTS. + + +5. LIMITATION OF LIABILITY + +TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL STABILITY AI BE LIABLE TO YOU (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE UNDER THIS LICENSE, OR (B) FOR ANY INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR LOST PROFITS, EVEN IF STABILITY AI HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. THE SOFTWARE PRODUCTS, THEIR CONSTITUENT COMPONENTS, AND ANY OUTPUT (COLLECTIVELY, “SOFTWARE MATERIALS”) ARE NOT DESIGNED OR INTENDED FOR USE IN ANY APPLICATION OR SITUATION WHERE FAILURE OR FAULT OF THE SOFTWARE MATERIALS COULD REASONABLY BE ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON, INCLUDING POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL’S PRIVACY RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE (EACH, A “HIGH-RISK USE”). IF YOU ELECT TO USE ANY OF THE SOFTWARE MATERIALS FOR A HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN AND IMPLEMENT APPROPRIATE DECISION-MAKING AND RISK-MITIGATION PROCEDURES AND POLICIES IN CONNECTION WITH A HIGH-RISK USE SUCH THAT EVEN IF THERE IS A FAILURE OR FAULT IN ANY OF THE SOFTWARE MATERIALS, THE SAFETY OF PERSONS OR PROPERTY AFFECTED BY THE ACTIVITY STAYS AT A LEVEL THAT IS REASONABLE, APPROPRIATE, AND LAWFUL FOR THE FIELD OF THE HIGH-RISK USE. + + +6. INDEMNIFICATION + +You will indemnify, defend and hold harmless Stability AI and our subsidiaries and affiliates, and each of our respective shareholders, directors, officers, employees, agents, successors, and assigns (collectively, the “Stability AI Parties”) from and against any losses, liabilities, damages, fines, penalties, and expenses (including reasonable attorneys’ fees) incurred by any Stability AI Party in connection with any claim, demand, allegation, lawsuit, proceeding, or investigation (collectively, “Claims”) arising out of or related to: (a) your access to or use of the Software Products (as well as any results or data generated from such access or use), including any High-Risk Use (defined below); (b) your violation of this License; or (c) your violation, misappropriation or infringement of any rights of another (including intellectual property or other proprietary rights and privacy rights). 
You will promptly notify the Stability AI Parties of any such Claims, and cooperate with Stability AI Parties in defending such Claims. You will also grant the Stability AI Parties sole control of the defense or settlement, at Stability AI’s sole option, of any Claims. This indemnity is in addition to, and not in lieu of, any other indemnities or remedies set forth in a written agreement between you and Stability AI or the other Stability AI Parties. + + +7. TERMINATION; SURVIVAL + +a. This License will automatically terminate upon any breach by you of the terms of this License. + +b. We may terminate this License, in whole or in part, at any time upon notice (including electronic) to you. + +c. The following sections survive termination of this License: 2 (Restrictions), 3 (Attribution), 4 (Disclaimers), 5 (Limitation on Liability), 6 (Indemnification) 7 (Termination; Survival), 8 (Third Party Materials), 9 (Trademarks), 10 (Applicable Law; Dispute Resolution), and 11 (Miscellaneous). + + +8. THIRD PARTY MATERIALS + +The Software Products may contain third-party software or other components (including free and open source software) (all of the foregoing, “Third Party Materials”), which are subject to the license terms of the respective third-party licensors. Your dealings or correspondence with third parties and your use of or interaction with any Third Party Materials are solely between you and the third party. Stability AI does not control or endorse, and makes no representations or warranties regarding, any Third Party Materials, and your access to and use of such Third Party Materials are at your own risk. + + +9. TRADEMARKS + +Licensee has not been granted any trademark license as part of this License and may not use any name or mark associated with Stability AI without the prior written permission of Stability AI, except to the extent necessary to make the reference required by the “ATTRIBUTION” section of this Agreement. + + +10. APPLICABLE LAW; DISPUTE RESOLUTION + +This License will be governed and construed under the laws of the State of California without regard to conflicts of law provisions. Any suit or proceeding arising out of or relating to this License will be brought in the federal or state courts, as applicable, in San Mateo County, California, and each party irrevocably submits to the jurisdiction and venue of such courts. + + +11. MISCELLANEOUS + +If any provision or part of a provision of this License is unlawful, void or unenforceable, that provision or part of the provision is deemed severed from this License, and will not affect the validity and enforceability of any remaining provisions. The failure of Stability AI to exercise or enforce any right or provision of this License will not operate as a waiver of such right or provision. This License does not confer any third-party beneficiary rights upon any other person or entity. This License, together with the Documentation, contains the entire understanding between you and Stability AI regarding the subject matter of this License, and supersedes all other written or oral agreements and understandings between you and Stability AI regarding such subject matter. No change or addition to any provision of this License will be binding unless it is in writing and signed by an authorized representative of both you and Stability AI. 
\ No newline at end of file diff --git a/repositories/generative-models/README.md b/repositories/generative-models/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9293884bf67772d298fe1de8ac405848f22720ae --- /dev/null +++ b/repositories/generative-models/README.md @@ -0,0 +1,194 @@ +# Generative Models by Stability AI + +![sample1](assets/000.jpg) + +## News + +**July 4, 2023** +- A technical report on SDXL is now available [here](assets/sdxl_report.pdf). + +**June 22, 2023** + + +- We are releasing two new diffusion models for research purposes: + - `SD-XL 0.9-base`: The base model was trained on a variety of aspect ratios on images with resolution 1024^2. The base model uses [OpenCLIP-ViT/G](https://github.com/mlfoundations/open_clip) and [CLIP-ViT/L](https://github.com/openai/CLIP/tree/main) for text encoding whereas the refiner model only uses the OpenCLIP model. + - `SD-XL 0.9-refiner`: The refiner has been trained to denoise small noise levels of high quality data and as such is not expected to work as a text-to-image model; instead, it should only be used as an image-to-image model. + +If you would like to access these models for your research, please apply using one of the following links: +[SDXL-0.9-Base model](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9), and [SDXL-0.9-Refiner](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9). +This means that you can apply for any of the two links - and if you are granted - you can access both. +Please log in to your Hugging Face Account with your organization email to request access. +**We plan to do a full release soon (July).** + +## The codebase + +### General Philosophy + +Modularity is king. This repo implements a config-driven approach where we build and combine submodules by calling `instantiate_from_config()` on objects defined in yaml configs. See `configs/` for many examples. + +### Changelog from the old `ldm` codebase + +For training, we use [pytorch-lightning](https://www.pytorchlightning.ai/index.html), but it should be easy to use other training wrappers around the base modules. The core diffusion model class (formerly `LatentDiffusion`, now `DiffusionEngine`) has been cleaned up: + +- No more extensive subclassing! We now handle all types of conditioning inputs (vectors, sequences and spatial conditionings, and all combinations thereof) in a single class: `GeneralConditioner`, see `sgm/modules/encoders/modules.py`. +- We separate guiders (such as classifier-free guidance, see `sgm/modules/diffusionmodules/guiders.py`) from the + samplers (`sgm/modules/diffusionmodules/sampling.py`), and the samplers are independent of the model. +- We adopt the ["denoiser framework"](https://arxiv.org/abs/2206.00364) for both training and inference (most notable change is probably now the option to train continuous time models): + * Discrete times models (denoisers) are simply a special case of continuous time models (denoisers); see `sgm/modules/diffusionmodules/denoiser.py`. + * The following features are now independent: weighting of the diffusion loss function (`sgm/modules/diffusionmodules/denoiser_weighting.py`), preconditioning of the network (`sgm/modules/diffusionmodules/denoiser_scaling.py`), and sampling of noise levels during training (`sgm/modules/diffusionmodules/sigma_sampling.py`). +- Autoencoding models have also been cleaned up. + +## Installation: + + +#### 1. 
Clone the repo + +```shell +git clone git@github.com:Stability-AI/generative-models.git +cd generative-models +``` + +#### 2. Setting up the virtualenv + +This assumes that you have navigated to the `generative-models` root after cloning it. + +**NOTE:** This is tested under `python3.8` and `python3.10`. For other Python versions, you might encounter version conflicts. + + +**PyTorch 1.13** + +```shell +# install required packages from pypi +python3 -m venv .pt1 +source .pt1/bin/activate +pip3 install wheel +pip3 install -r requirements_pt13.txt +``` + +**PyTorch 2.0** + + +```shell +# install required packages from pypi +python3 -m venv .pt2 +source .pt2/bin/activate +pip3 install wheel +pip3 install -r requirements_pt2.txt +``` + +## Inference: + +We provide a [streamlit](https://streamlit.io/) demo for text-to-image and image-to-image sampling in `scripts/demo/sampling.py`. The following models are currently supported: +- [SD-XL 0.9-base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9) +- [SD-XL 0.9-refiner](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9) +- [SD 2.1-512](https://huggingface.co/stabilityai/stable-diffusion-2-1-base/blob/main/v2-1_512-ema-pruned.safetensors) +- [SD 2.1-768](https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors) + +**Weights for SDXL**: +If you would like to access these models for your research, please apply using one of the following links: +[SDXL-0.9-Base model](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9), and [SDXL-0.9-Refiner](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9). +This means that you can apply via either of the two links, and if you are granted access, you can access both. +Please log in to your Hugging Face account with your organization email to request access. + +After obtaining the weights, place them into `checkpoints/`. +Next, start the demo using + +``` +streamlit run scripts/demo/sampling.py --server.port <your_port> +``` + +### Invisible Watermark Detection + +Images generated with our code use the +[invisible-watermark](https://github.com/ShieldMnt/invisible-watermark/) +library to embed an invisible watermark into the model output. We also provide +a script to easily detect that watermark. Please note that this watermark is +not the same as in previous Stable Diffusion 1.x/2.x versions. + +To run the script you need to either have a working installation as above or +try an _experimental_ import using only a minimal amount of packages: +```bash +python -m venv .detect +source .detect/bin/activate + +pip install "numpy>=1.17" "PyWavelets>=1.1.1" "opencv-python>=4.1.0.25" +pip install --no-deps invisible-watermark +``` + +With a working installation in place, the script is usable in the following ways (don't forget to activate your +virtual environment beforehand, e.g. `source .pt1/bin/activate`): +```bash +# test a single file +python scripts/demo/detect.py <your filename here> +# test multiple files at once +python scripts/demo/detect.py <filename 1> <filename 2> ... <filename n> +# test all files in a specific folder +python scripts/demo/detect.py <your folder name here>/* +``` + +## Training: + +We are providing example training configs in `configs/example_training`. To launch a training, run + +``` +python main.py --base configs/<config1.yaml> configs/<config2.yaml> +``` + +where configs are merged from left to right (later configs overwrite the same values). +This can be used to combine model, training and data configs. However, all of them can also be +defined in a single config. A minimal sketch of the left-to-right merge behaviour is shown below. 
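The sketch assumes OmegaConf-style merging, as in the old `ldm` codebase; the config dictionaries here are purely illustrative and not actual repository configs.

```python
# Illustrative only: how "merged from left to right" behaves, assuming the
# training entry point merges its --base configs with OmegaConf.
from omegaconf import OmegaConf

base_cfg = OmegaConf.create(
    {"model": {"base_learning_rate": 1.0e-4},
     "data": {"params": {"batch_size": 512}}}
)
override_cfg = OmegaConf.create({"data": {"params": {"batch_size": 64}}})

# Values from later (right-most) configs overwrite earlier ones;
# keys that are not overridden are kept unchanged.
merged = OmegaConf.merge(base_cfg, override_cfg)
assert merged.data.params.batch_size == 64
assert merged.model.base_learning_rate == 1.0e-4
```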
For example, to run a class-conditional pixel-based diffusion model training on MNIST, +run + +```bash +python main.py --base configs/example_training/toy/mnist_cond.yaml +``` + +**NOTE 1:** Using the non-toy-dataset configs `configs/example_training/imagenet-f8_cond.yaml`, `configs/example_training/txt2img-clipl.yaml` and `configs/example_training/txt2img-clipl-legacy-ucg-training.yaml` for training will require edits depending on the used dataset (which is expected to be stored as tar files in the [webdataset format](https://github.com/webdataset/webdataset)). To find the parts that have to be adapted, search for comments containing `USER:` in the respective config. + +**NOTE 2:** This repository supports both `pytorch1.13` and `pytorch2` for training generative models. However, for autoencoder training, e.g. as in `configs/example_training/autoencoder/kl-f4/imagenet-attnfree-logvar.yaml`, only `pytorch1.13` is supported. + +**NOTE 3:** Training latent generative models (as e.g. in `configs/example_training/imagenet-f8_cond.yaml`) requires retrieving the checkpoint from [Hugging Face](https://huggingface.co/stabilityai/sdxl-vae/tree/main) and replacing the `CKPT_PATH` placeholder in [this line](configs/example_training/imagenet-f8_cond.yaml#81). The same has to be done for the provided text-to-image configs. + +### Building New Diffusion Models + +#### Conditioner + +The `GeneralConditioner` is configured through the `conditioner_config`. Its only attribute is `emb_models`, a list of +different embedders (all inheriting from `AbstractEmbModel`) that are used to condition the generative model. +All embedders should define whether or not they are trainable (`is_trainable`, default `False`), the classifier-free +guidance dropout rate that is used (`ucg_rate`, default `0`), and an input key (`input_key`), for example, `txt` for text-conditioning or `cls` for class-conditioning. +When computing conditionings, the embedder will get `batch[input_key]` as input. +We currently support two- to four-dimensional conditionings, and conditionings from different embedders are concatenated +appropriately. +Note that the order of the embedders in the `conditioner_config` is important. + +#### Network + +The neural network is set through the `network_config`. This used to be called `unet_config`, which is not general +enough as we plan to experiment with transformer-based diffusion backbones. + +#### Loss + +The loss is configured through `loss_config`. For standard diffusion model training, you will have to set `sigma_sampler_config`. + +#### Sampler config + +As discussed above, the sampler is independent of the model. In the `sampler_config`, we set the type of numerical +solver, the number of steps, the type of discretization, as well as guidance wrappers, e.g. for classifier-free +guidance. + +### Dataset Handling + + +For large-scale training we recommend using the data pipelines from our [data pipelines](https://github.com/Stability-AI/datapipelines) project. The project is contained in the requirements and automatically included when following the steps from the [Installation section](#installation). +Small map-style datasets should be defined here in the repository (e.g., MNIST, CIFAR-10, ...), and return a dict of +data keys/values, +e.g., + +```python +example = {"jpg": x, # this is a tensor -1...1 chw + "txt": "a beautiful image"} +``` + +where we expect images in -1...1, channel-first format. 
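A minimal sketch of such a map-style dataset follows; it assumes a torchvision dataset plus a fixed placeholder caption as stand-ins for real data, and the class name `ToyCaptionedDataset` is illustrative rather than part of this repository.

```python
# Toy map-style dataset returning the dict format described above.
import torch
from torch.utils.data import Dataset
from torchvision import datasets, transforms


class ToyCaptionedDataset(Dataset):
    def __init__(self, root: str = "./data", train: bool = True):
        # ToTensor yields [0, 1] CHW floats; rescale to [-1, 1] as expected above.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Lambda(lambda x: x * 2.0 - 1.0),
        ])
        self.base = datasets.CIFAR10(root=root, train=train, download=True)

    def __len__(self) -> int:
        return len(self.base)

    def __getitem__(self, idx: int) -> dict:
        img, label = self.base[idx]              # PIL image, int label
        return {
            "jpg": self.transform(img),          # float tensor, CHW, in [-1, 1]
            "txt": f"a photo of class {label}",  # placeholder caption
        }
```

A dataset like this would then be wrapped in a standard PyTorch `DataLoader` or a small Lightning data module, similar to the `MNISTLoader`/`CIFAR10Loader` targets referenced in the toy configs.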
diff --git a/repositories/generative-models/assets/000.jpg b/repositories/generative-models/assets/000.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e93d6c1b6e07edcb9b5efa946fc3092fe8b4e9b7 Binary files /dev/null and b/repositories/generative-models/assets/000.jpg differ diff --git a/repositories/generative-models/assets/sdxl_report.pdf b/repositories/generative-models/assets/sdxl_report.pdf new file mode 100644 index 0000000000000000000000000000000000000000..839136bd760b6d0435915a89a797055727ec764b --- /dev/null +++ b/repositories/generative-models/assets/sdxl_report.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d63345686bc36e6f6de1c20610a7657fafba4f24a9e892ea6f0b9a9f36b5c00 +size 18172854 diff --git a/repositories/generative-models/configs/example_training/autoencoder/kl-f4/imagenet-attnfree-logvar.yaml b/repositories/generative-models/configs/example_training/autoencoder/kl-f4/imagenet-attnfree-logvar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..482b25901b749731901aa54a9d07888e8f74a08b --- /dev/null +++ b/repositories/generative-models/configs/example_training/autoencoder/kl-f4/imagenet-attnfree-logvar.yaml @@ -0,0 +1,115 @@ +model: + base_learning_rate: 4.5e-6 + target: sgm.models.autoencoder.AutoencodingEngine + params: + input_key: jpg + monitor: val/rec_loss + + loss_config: + target: sgm.modules.autoencoding.losses.GeneralLPIPSWithDiscriminator + params: + perceptual_weight: 0.25 + disc_start: 20001 + disc_weight: 0.5 + learn_logvar: True + + regularization_weights: + kl_loss: 1.0 + + regularizer_config: + target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer + + encoder_config: + target: sgm.modules.diffusionmodules.model.Encoder + params: + attn_type: none + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [ 1, 2, 4 ] + num_res_blocks: 4 + attn_resolutions: [ ] + dropout: 0.0 + + decoder_config: + target: sgm.modules.diffusionmodules.model.Decoder + params: + attn_type: none + double_z: False + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [ 1, 2, 4 ] + num_res_blocks: 4 + attn_resolutions: [ ] + dropout: 0.0 + +data: + target: sgm.data.dataset.StableDataModuleFromConfig + params: + train: + datapipeline: + urls: + - "DATA-PATH" + pipeline_config: + shardshuffle: 10000 + sample_shuffle: 10000 + + decoders: + - "pil" + + postprocessors: + - target: sdata.mappers.TorchVisionImageTransforms + params: + key: 'jpg' + transforms: + - target: torchvision.transforms.Resize + params: + size: 256 + interpolation: 3 + - target: torchvision.transforms.ToTensor + - target: sdata.mappers.Rescaler + - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare + params: + h_key: height + w_key: width + + loader: + batch_size: 8 + num_workers: 4 + + +lightning: + strategy: + target: pytorch_lightning.strategies.DDPStrategy + params: + find_unused_parameters: True + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 50000 + + image_logger: + target: main.ImageLogger + params: + enable_autocast: False + batch_frequency: 1000 + max_images: 8 + increase_log_steps: True + + trainer: + devices: 0, + limit_val_batches: 50 + benchmark: True + accumulate_grad_batches: 1 + val_check_interval: 10000 \ No newline at end of file diff --git a/repositories/generative-models/configs/example_training/imagenet-f8_cond.yaml 
b/repositories/generative-models/configs/example_training/imagenet-f8_cond.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60627331bc57349414a49ac5b9a04893fbe3f2be --- /dev/null +++ b/repositories/generative-models/configs/example_training/imagenet-f8_cond.yaml @@ -0,0 +1,188 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.13025 + disable_first_stage_autocast: True + log_keys: + - cls + + scheduler_config: + target: sgm.lr_scheduler.LambdaLinearScheduler + params: + warm_up_steps: [10000] + cycle_lengths: [10000000000000] + f_start: [1.e-6] + f_max: [1.] + f_min: [1.] + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + weighting_config: + target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + use_fp16: True + in_channels: 4 + out_channels: 4 + model_channels: 256 + attention_resolutions: [1, 2, 4] + num_res_blocks: 2 + channel_mult: [1, 2, 4] + num_head_channels: 64 + num_classes: sequential + adm_in_channels: 1024 + use_spatial_transformer: true + transformer_depth: 1 + context_dim: 1024 + spatial_transformer_attn_type: softmax-xformers + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + # crossattn cond + - is_trainable: True + input_key: cls + ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ClassEmbedder + params: + add_sequence_dim: True # will be used through crossattn then + embed_dim: 1024 + n_classes: 1000 + # vector cond + - is_trainable: False + ucg_rate: 0.2 + input_key: original_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 # multiplied by two + # vector cond + - is_trainable: False + input_key: crop_coords_top_left + ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 # multiplied by two + + first_stage_config: + target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper + params: + ckpt_path: CKPT_PATH + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling + params: + num_idx: 1000 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 5.0 + +data: + target: sgm.data.dataset.StableDataModuleFromConfig + params: + train: + datapipeline: + urls: + # USER: adapt this path the root of your custom dataset + - "DATA_PATH" + pipeline_config: + shardshuffle: 10000 + sample_shuffle: 10000 # USER: 
you might wanna adapt depending on your available RAM + + decoders: + - "pil" + + postprocessors: + - target: sdata.mappers.TorchVisionImageTransforms + params: + key: 'jpg' # USER: you might wanna adapt this for your custom dataset + transforms: + - target: torchvision.transforms.Resize + params: + size: 256 + interpolation: 3 + - target: torchvision.transforms.ToTensor + - target: sdata.mappers.Rescaler + + - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare + params: + h_key: height # USER: you might wanna adapt this for your custom dataset + w_key: width # USER: you might wanna adapt this for your custom dataset + + loader: + batch_size: 64 + num_workers: 6 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + enable_autocast: False + batch_frequency: 1000 + max_images: 8 + increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 8 + n_rows: 2 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 1000 \ No newline at end of file diff --git a/repositories/generative-models/configs/example_training/toy/cifar10_cond.yaml b/repositories/generative-models/configs/example_training/toy/cifar10_cond.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36ba2527a0a8364b91052ef40a3af099996fdff5 --- /dev/null +++ b/repositories/generative-models/configs/example_training/toy/cifar10_cond.yaml @@ -0,0 +1,99 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + weighting_config: + target: sgm.modules.diffusionmodules.denoiser_weighting.EDMWeighting + params: + sigma_data: 1.0 + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling + params: + sigma_data: 1.0 + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + in_channels: 3 + out_channels: 3 + model_channels: 32 + attention_resolutions: [] + num_res_blocks: 4 + channel_mult: [1, 2, 2] + num_head_channels: 32 + num_classes: sequential + adm_in_channels: 128 + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: True + input_key: cls + ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ClassEmbedder + params: + embed_dim: 128 + n_classes: 10 + + first_stage_config: + target: sgm.models.autoencoder.IdentityFirstStage + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 3.0 + +data: + target: sgm.data.cifar10.CIFAR10Loader + params: + batch_size: 512 + num_workers: 1 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + batch_frequency: 1000 + max_images: 64 + 
increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 64 + n_rows: 8 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 20 \ No newline at end of file diff --git a/repositories/generative-models/configs/example_training/toy/mnist.yaml b/repositories/generative-models/configs/example_training/toy/mnist.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44d8e6fea88155ad9e8bcd9724d0f074f6796798 --- /dev/null +++ b/repositories/generative-models/configs/example_training/toy/mnist.yaml @@ -0,0 +1,80 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + weighting_config: + target: sgm.modules.diffusionmodules.denoiser_weighting.EDMWeighting + params: + sigma_data: 1.0 + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling + params: + sigma_data: 1.0 + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + in_channels: 1 + out_channels: 1 + model_channels: 32 + attention_resolutions: [] + num_res_blocks: 4 + channel_mult: [1, 2, 2] + num_head_channels: 32 + + first_stage_config: + target: sgm.models.autoencoder.IdentityFirstStage + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization + +data: + target: sgm.data.mnist.MNISTLoader + params: + batch_size: 512 + num_workers: 1 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + batch_frequency: 1000 + max_images: 64 + increase_log_steps: False + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 64 + n_rows: 8 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 10 \ No newline at end of file diff --git a/repositories/generative-models/configs/example_training/toy/mnist_cond.yaml b/repositories/generative-models/configs/example_training/toy/mnist_cond.yaml new file mode 100644 index 0000000000000000000000000000000000000000..557be128b9493428e3378621173493588069a780 --- /dev/null +++ b/repositories/generative-models/configs/example_training/toy/mnist_cond.yaml @@ -0,0 +1,99 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + weighting_config: + target: sgm.modules.diffusionmodules.denoiser_weighting.EDMWeighting + params: + sigma_data: 1.0 + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling + params: + sigma_data: 1.0 + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + in_channels: 1 + out_channels: 1 + model_channels: 32 + attention_resolutions: [ ] + num_res_blocks: 4 + channel_mult: [ 1, 2, 2 ] + num_head_channels: 32 + num_classes: sequential + adm_in_channels: 128 + + 
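+    # Class-conditioning: the ClassEmbedder below emits a 128-dim vector per label; with num_classes: sequential above, the UNet consumes this vector conditioning, so embed_dim is chosen to match adm_in_channels (both 128).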
conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: True + input_key: "cls" + ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ClassEmbedder + params: + embed_dim: 128 + n_classes: 10 + + first_stage_config: + target: sgm.models.autoencoder.IdentityFirstStage + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 3.0 + +data: + target: sgm.data.mnist.MNISTLoader + params: + batch_size: 512 + num_workers: 1 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + batch_frequency: 1000 + max_images: 16 + increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 16 + n_rows: 4 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 20 \ No newline at end of file