ujin-song committed
Commit 8e12b4e
1 parent: 1c6e4b7

upload mixofshow and orthogonal_mats folder

Files changed (50)
  1. mixofshow/.DS_Store +0 -0
  2. mixofshow/data/__init__.py +0 -0
  3. mixofshow/data/__pycache__/__init__.cpython-38.pyc +0 -0
  4. mixofshow/data/__pycache__/__init__.cpython-39.pyc +0 -0
  5. mixofshow/data/__pycache__/lora_dataset.cpython-38.pyc +0 -0
  6. mixofshow/data/__pycache__/lora_dataset.cpython-39.pyc +0 -0
  7. mixofshow/data/__pycache__/pil_transform.cpython-38.pyc +0 -0
  8. mixofshow/data/__pycache__/pil_transform.cpython-39.pyc +0 -0
  9. mixofshow/data/__pycache__/prompt_dataset.cpython-38.pyc +0 -0
  10. mixofshow/data/__pycache__/prompt_dataset.cpython-39.pyc +0 -0
  11. mixofshow/data/lora_dataset.py +102 -0
  12. mixofshow/data/pil_transform.py +366 -0
  13. mixofshow/data/prompt_dataset.py +67 -0
  14. mixofshow/models/__pycache__/edlora.cpython-310.pyc +0 -0
  15. mixofshow/models/__pycache__/edlora.cpython-38.pyc +0 -0
  16. mixofshow/models/__pycache__/edlora.cpython-39.pyc +0 -0
  17. mixofshow/models/edlora.py +259 -0
  18. mixofshow/pipelines/__pycache__/pipeline_edlora.cpython-310.pyc +0 -0
  19. mixofshow/pipelines/__pycache__/pipeline_edlora.cpython-38.pyc +0 -0
  20. mixofshow/pipelines/__pycache__/pipeline_edlora.cpython-39.pyc +0 -0
  21. mixofshow/pipelines/__pycache__/pipeline_regionally_t2iadapter.cpython-310.pyc +0 -0
  22. mixofshow/pipelines/__pycache__/pipeline_regionally_t2iadapter.cpython-38.pyc +0 -0
  23. mixofshow/pipelines/__pycache__/pipeline_regionally_t2iadapter.cpython-39.pyc +0 -0
  24. mixofshow/pipelines/__pycache__/trainer_edlora.cpython-38.pyc +0 -0
  25. mixofshow/pipelines/__pycache__/trainer_edlora.cpython-39.pyc +0 -0
  26. mixofshow/pipelines/pipeline_edlora.py +322 -0
  27. mixofshow/pipelines/pipeline_regionally_t2iadapter.py +608 -0
  28. mixofshow/pipelines/trainer_edlora.py +380 -0
  29. mixofshow/utils/__init__.py +0 -0
  30. mixofshow/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  31. mixofshow/utils/__pycache__/__init__.cpython-38.pyc +0 -0
  32. mixofshow/utils/__pycache__/__init__.cpython-39.pyc +0 -0
  33. mixofshow/utils/__pycache__/convert_edlora_to_diffusers.cpython-38.pyc +0 -0
  34. mixofshow/utils/__pycache__/convert_edlora_to_diffusers.cpython-39.pyc +0 -0
  35. mixofshow/utils/__pycache__/ptp_util.cpython-38.pyc +0 -0
  36. mixofshow/utils/__pycache__/ptp_util.cpython-39.pyc +0 -0
  37. mixofshow/utils/__pycache__/registry.cpython-38.pyc +0 -0
  38. mixofshow/utils/__pycache__/registry.cpython-39.pyc +0 -0
  39. mixofshow/utils/__pycache__/util.cpython-310.pyc +0 -0
  40. mixofshow/utils/__pycache__/util.cpython-38.pyc +0 -0
  41. mixofshow/utils/__pycache__/util.cpython-39.pyc +0 -0
  42. mixofshow/utils/arial.ttf +0 -0
  43. mixofshow/utils/convert_edlora_to_diffusers.py +99 -0
  44. mixofshow/utils/ptp_util.py +200 -0
  45. mixofshow/utils/registry.py +79 -0
  46. mixofshow/utils/util.py +313 -0
  47. orthogonal_mats/1280.npy +3 -0
  48. orthogonal_mats/320.npy +3 -0
  49. orthogonal_mats/640.npy +3 -0
  50. orthogonal_mats/768.npy +3 -0
mixofshow/.DS_Store ADDED
Binary file (6.15 kB).

mixofshow/data/__init__.py ADDED
File without changes

mixofshow/data/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (148 Bytes).

mixofshow/data/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (148 Bytes).

mixofshow/data/__pycache__/lora_dataset.cpython-38.pyc ADDED
Binary file (3.02 kB).

mixofshow/data/__pycache__/lora_dataset.cpython-39.pyc ADDED
Binary file (3.07 kB).

mixofshow/data/__pycache__/pil_transform.cpython-38.pyc ADDED
Binary file (10.9 kB).

mixofshow/data/__pycache__/pil_transform.cpython-39.pyc ADDED
Binary file (10.8 kB).

mixofshow/data/__pycache__/prompt_dataset.cpython-38.pyc ADDED
Binary file (2.35 kB).

mixofshow/data/__pycache__/prompt_dataset.cpython-39.pyc ADDED
Binary file (2.36 kB).
mixofshow/data/lora_dataset.py ADDED
@@ -0,0 +1,102 @@
import json
import os
import random
import re
from pathlib import Path

from PIL import Image
from torch.utils.data import Dataset

from mixofshow.data.pil_transform import PairCompose, build_transform


class LoraDataset(Dataset):
    """
    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
    It pre-processes the images and tokenizes the prompts.
    """
    def __init__(self, opt):
        self.opt = opt
        self.instance_images_path = []

        with open(opt['concept_list'], 'r') as f:
            concept_list = json.load(f)

        replace_mapping = opt.get('replace_mapping', {})
        use_caption = opt.get('use_caption', False)
        use_mask = opt.get('use_mask', False)

        for concept in concept_list:
            instance_prompt = concept['instance_prompt']
            caption_dir = concept.get('caption_dir')
            mask_dir = concept.get('mask_dir')

            instance_prompt = self.process_text(instance_prompt, replace_mapping)

            inst_img_path = []
            for x in Path(concept['instance_data_dir']).iterdir():
                if x.is_file() and x.name != '.DS_Store':
                    basename = os.path.splitext(os.path.basename(x))[0]
                    caption_path = os.path.join(caption_dir, f'{basename}.txt') if caption_dir is not None else None

                    if use_caption and caption_path is not None and os.path.exists(caption_path):
                        with open(caption_path, 'r') as fr:
                            line = fr.readlines()[0]
                            instance_prompt_image = self.process_text(line, replace_mapping)
                    else:
                        instance_prompt_image = instance_prompt

                    if use_mask and mask_dir is not None:
                        mask_path = os.path.join(mask_dir, f'{basename}.png')
                    else:
                        mask_path = None

                    inst_img_path.append((x, instance_prompt_image, mask_path))

            self.instance_images_path.extend(inst_img_path)

        random.shuffle(self.instance_images_path)
        self.num_instance_images = len(self.instance_images_path)

        self.instance_transform = PairCompose([
            build_transform(transform_opt)
            for transform_opt in opt['instance_transform']
        ])

    def process_text(self, instance_prompt, replace_mapping):
        for k, v in replace_mapping.items():
            instance_prompt = instance_prompt.replace(k, v)
        instance_prompt = instance_prompt.strip()
        instance_prompt = re.sub(' +', ' ', instance_prompt)
        return instance_prompt

    def __len__(self):
        return self.num_instance_images * self.opt['dataset_enlarge_ratio']

    def __getitem__(self, index):
        example = {}
        instance_image, instance_prompt, instance_mask = self.instance_images_path[index % self.num_instance_images]
        instance_image = Image.open(instance_image).convert('RGB')

        extra_args = {'prompts': instance_prompt}
        if instance_mask is not None:
            instance_mask = Image.open(instance_mask).convert('L')
            extra_args.update({'mask': instance_mask})

        instance_image, extra_args = self.instance_transform(instance_image, **extra_args)
        example['images'] = instance_image

        if 'mask' in extra_args:
            example['masks'] = extra_args['mask']
            example['masks'] = example['masks'].unsqueeze(0)
        else:
            pass

        if 'img_mask' in extra_args:
            example['img_masks'] = extra_args['img_mask']
            example['img_masks'] = example['img_masks'].unsqueeze(0)
        else:
            raise NotImplementedError

        example['prompts'] = extra_args['prompts']
        return example
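For orientation, here is a minimal, hypothetical configuration sketch for LoraDataset. The file paths, token names, and transform parameters are illustrative and not part of this commit; the class only requires that the keys match what __init__ reads.

# Hypothetical usage sketch for LoraDataset (paths and values are illustrative).
import json

# one entry per concept; caption_dir / mask_dir are optional and only used when
# use_caption / use_mask are enabled in the dataset options
concept_list = [{
    'instance_prompt': '<TOK1> <TOK2>',
    'instance_data_dir': 'datasets/my_concept/image',
    'caption_dir': 'datasets/my_concept/caption',
    'mask_dir': 'datasets/my_concept/mask',
}]
with open('concept_list.json', 'w') as f:
    json.dump(concept_list, f)

opt = {
    'concept_list': 'concept_list.json',
    'use_caption': True,
    'use_mask': True,
    'replace_mapping': {'<TOK1> <TOK2>': '<concept1> <concept2>'},
    'dataset_enlarge_ratio': 100,
    # each dict is dispatched by build_transform() via its 'type' key (see pil_transform.py below);
    # HumanResizeCropFinalV3 also produces the 'img_mask' that __getitem__ expects
    'instance_transform': [
        {'type': 'HumanResizeCropFinalV3', 'size': 512, 'crop_p': 0.5},
        {'type': 'ToTensor'},
        {'type': 'Normalize', 'mean': [0.5, 0.5, 0.5], 'std': [0.5, 0.5, 0.5]},
    ],
}

# from mixofshow.data.lora_dataset import LoraDataset
# dataset = LoraDataset(opt)   # items contain 'images', 'prompts', 'img_masks' and, when masks are used, 'masks'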
mixofshow/data/pil_transform.py ADDED
@@ -0,0 +1,366 @@
import inspect
import random
from copy import deepcopy

import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms.functional as F
from PIL import Image
from torchvision.transforms import CenterCrop, Normalize, RandomCrop, RandomHorizontalFlip, Resize
from torchvision.transforms.functional import InterpolationMode

from mixofshow.utils.registry import TRANSFORM_REGISTRY


def build_transform(opt):
    """Build transform from options.
    Args:
        opt (dict): Configuration.
    """
    opt = deepcopy(opt)
    transform_type = opt.pop('type')
    transform = TRANSFORM_REGISTRY.get(transform_type)(**opt)
    return transform


TRANSFORM_REGISTRY.register(Normalize)
TRANSFORM_REGISTRY.register(Resize)
TRANSFORM_REGISTRY.register(RandomHorizontalFlip)
TRANSFORM_REGISTRY.register(CenterCrop)
TRANSFORM_REGISTRY.register(RandomCrop)


@TRANSFORM_REGISTRY.register()
class BILINEARResize(Resize):
    def __init__(self, size):
        super(BILINEARResize,
              self).__init__(size, interpolation=InterpolationMode.BILINEAR)


@TRANSFORM_REGISTRY.register()
class PairRandomCrop(nn.Module):
    def __init__(self, size):
        super().__init__()
        if isinstance(size, int):
            self.height, self.width = size, size
        else:
            self.height, self.width = size

    def forward(self, img, **kwargs):
        img_width, img_height = img.size
        mask_width, mask_height = kwargs['mask'].size

        assert img_height >= self.height and img_height == mask_height
        assert img_width >= self.width and img_width == mask_width

        x = random.randint(0, img_width - self.width)
        y = random.randint(0, img_height - self.height)
        img = F.crop(img, y, x, self.height, self.width)
        kwargs['mask'] = F.crop(kwargs['mask'], y, x, self.height, self.width)
        return img, kwargs


@TRANSFORM_REGISTRY.register()
class ToTensor(nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, pic):
        return F.to_tensor(pic)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}()'


@TRANSFORM_REGISTRY.register()
class PairRandomHorizontalFlip(torch.nn.Module):
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, img, **kwargs):
        if torch.rand(1) < self.p:
            kwargs['mask'] = F.hflip(kwargs['mask'])
            return F.hflip(img), kwargs
        return img, kwargs


@TRANSFORM_REGISTRY.register()
class PairResize(nn.Module):
    def __init__(self, size):
        super().__init__()
        self.resize = Resize(size=size)

    def forward(self, img, **kwargs):
        kwargs['mask'] = self.resize(kwargs['mask'])
        img = self.resize(img)
        return img, kwargs


class PairCompose(nn.Module):
    def __init__(self, transforms):
        super().__init__()
        self.transforms = transforms

    def __call__(self, img, **kwargs):
        for t in self.transforms:
            if len(inspect.signature(t.forward).parameters
                   ) == 1:  # count how many args, not count self
                img = t(img)
            else:
                img, kwargs = t(img, **kwargs)
        return img, kwargs

    def __repr__(self) -> str:
        format_string = self.__class__.__name__ + '('
        for t in self.transforms:
            format_string += '\n'
            format_string += f'    {t}'
        format_string += '\n)'
        return format_string


@TRANSFORM_REGISTRY.register()
class HumanResizeCropFinalV3(nn.Module):
    def __init__(self, size, crop_p=0.5):
        super().__init__()
        self.size = size
        self.crop_p = crop_p
        self.random_crop = RandomCrop(size=size)
        self.paired_random_crop = PairRandomCrop(size=size)

    def forward(self, img, **kwargs):
        # step 1: short edge resize to 512
        img = F.resize(img, size=self.size)
        if 'mask' in kwargs:
            kwargs['mask'] = F.resize(kwargs['mask'], size=self.size)

        # step 2: random crop
        width, height = img.size
        if random.random() < self.crop_p:
            if height > width:
                crop_pos = random.randint(0, height - width)
                img = F.crop(img, 0, 0, width + crop_pos, width)
                if 'mask' in kwargs:
                    kwargs['mask'] = F.crop(kwargs['mask'], 0, 0, width + crop_pos, width)
            else:
                if 'mask' in kwargs:
                    img, kwargs = self.paired_random_crop(img, **kwargs)
                else:
                    img = self.random_crop(img)
        else:
            img = img

        # step 3: long edge resize
        img = F.resize(img, size=self.size - 1, max_size=self.size)
        if 'mask' in kwargs:
            kwargs['mask'] = F.resize(kwargs['mask'], size=self.size - 1, max_size=self.size)

        new_width, new_height = img.size

        img = np.array(img)
        if 'mask' in kwargs:
            kwargs['mask'] = np.array(kwargs['mask']) / 255
            new_width = min(new_width, kwargs['mask'].shape[1])
            new_height = min(new_height, kwargs['mask'].shape[0])

        start_y = random.randint(0, 512 - new_height)
        start_x = random.randint(0, 512 - new_width)

        res_img = np.zeros((self.size, self.size, 3), dtype=np.uint8)
        res_mask = np.zeros((self.size, self.size))
        res_img_mask = np.zeros((self.size, self.size))

        res_img[start_y:start_y + new_height, start_x:start_x + new_width, :] = img[:new_height, :new_width]
        if 'mask' in kwargs:
            res_mask[start_y:start_y + new_height, start_x:start_x + new_width] = kwargs['mask'][:new_height, :new_width]
            kwargs['mask'] = res_mask

        res_img_mask[start_y:start_y + new_height, start_x:start_x + new_width] = 1
        kwargs['img_mask'] = res_img_mask

        img = Image.fromarray(res_img)

        if 'mask' in kwargs:
            kwargs['mask'] = cv2.resize(kwargs['mask'], (self.size // 8, self.size // 8), cv2.INTER_NEAREST)
            kwargs['mask'] = torch.from_numpy(kwargs['mask'])
        kwargs['img_mask'] = cv2.resize(kwargs['img_mask'], (self.size // 8, self.size // 8), cv2.INTER_NEAREST)
        kwargs['img_mask'] = torch.from_numpy(kwargs['img_mask'])
        return img, kwargs


@TRANSFORM_REGISTRY.register()
class ResizeFillMaskNew(nn.Module):
    def __init__(self, size, crop_p, scale_ratio):
        super().__init__()
        self.size = size
        self.crop_p = crop_p
        self.scale_ratio = scale_ratio
        self.random_crop = RandomCrop(size=size)
        self.paired_random_crop = PairRandomCrop(size=size)

    def forward(self, img, **kwargs):
        # width, height = img.size

        # step 1: short edge resize to 512
        img = F.resize(img, size=self.size)
        if 'mask' in kwargs:
            kwargs['mask'] = F.resize(kwargs['mask'], size=self.size)

        # step 2: random crop
        if random.random() < self.crop_p:
            if 'mask' in kwargs:
                img, kwargs = self.paired_random_crop(img, **kwargs)  # 512
            else:
                img = self.random_crop(img)  # 512
        else:
            # long edge resize
            img = F.resize(img, size=self.size - 1, max_size=self.size)
            if 'mask' in kwargs:
                kwargs['mask'] = F.resize(kwargs['mask'], size=self.size - 1, max_size=self.size)

        # step 3: random aspect ratio
        width, height = img.size
        ratio = random.uniform(*self.scale_ratio)

        img = F.resize(img, size=(int(height * ratio), int(width * ratio)))
        if 'mask' in kwargs:
            kwargs['mask'] = F.resize(kwargs['mask'], size=(int(height * ratio), int(width * ratio)), interpolation=0)

        # step 4: random place
        new_width, new_height = img.size

        img = np.array(img)
        if 'mask' in kwargs:
            kwargs['mask'] = np.array(kwargs['mask']) / 255

        start_y = random.randint(0, 512 - new_height)
        start_x = random.randint(0, 512 - new_width)

        res_img = np.zeros((self.size, self.size, 3), dtype=np.uint8)
        res_mask = np.zeros((self.size, self.size))
        res_img_mask = np.zeros((self.size, self.size))

        res_img[start_y:start_y + new_height, start_x:start_x + new_width, :] = img
        if 'mask' in kwargs:
            res_mask[start_y:start_y + new_height, start_x:start_x + new_width] = kwargs['mask']
            kwargs['mask'] = res_mask

        res_img_mask[start_y:start_y + new_height, start_x:start_x + new_width] = 1
        kwargs['img_mask'] = res_img_mask

        img = Image.fromarray(res_img)

        if 'mask' in kwargs:
            kwargs['mask'] = cv2.resize(kwargs['mask'], (self.size // 8, self.size // 8), cv2.INTER_NEAREST)
            kwargs['mask'] = torch.from_numpy(kwargs['mask'])
        kwargs['img_mask'] = cv2.resize(kwargs['img_mask'], (self.size // 8, self.size // 8), cv2.INTER_NEAREST)
        kwargs['img_mask'] = torch.from_numpy(kwargs['img_mask'])

        return img, kwargs


@TRANSFORM_REGISTRY.register()
class ShuffleCaption(nn.Module):
    def __init__(self, keep_token_num):
        super().__init__()
        self.keep_token_num = keep_token_num

    def forward(self, img, **kwargs):
        prompts = kwargs['prompts'].strip()

        fixed_tokens = []
        flex_tokens = [t.strip() for t in prompts.strip().split(',')]
        if self.keep_token_num > 0:
            fixed_tokens = flex_tokens[:self.keep_token_num]
            flex_tokens = flex_tokens[self.keep_token_num:]

        random.shuffle(flex_tokens)
        prompts = ', '.join(fixed_tokens + flex_tokens)
        kwargs['prompts'] = prompts
        return img, kwargs


@TRANSFORM_REGISTRY.register()
class EnhanceText(nn.Module):
    def __init__(self, enhance_type='object'):
        super().__init__()
        STYLE_TEMPLATE = [
            'a painting in the style of {}',
            'a rendering in the style of {}',
            'a cropped painting in the style of {}',
            'the painting in the style of {}',
            'a clean painting in the style of {}',
            'a dirty painting in the style of {}',
            'a dark painting in the style of {}',
            'a picture in the style of {}',
            'a cool painting in the style of {}',
            'a close-up painting in the style of {}',
            'a bright painting in the style of {}',
            'a cropped painting in the style of {}',
            'a good painting in the style of {}',
            'a close-up painting in the style of {}',
            'a rendition in the style of {}',
            'a nice painting in the style of {}',
            'a small painting in the style of {}',
            'a weird painting in the style of {}',
            'a large painting in the style of {}',
        ]

        OBJECT_TEMPLATE = [
            'a photo of a {}',
            'a rendering of a {}',
            'a cropped photo of the {}',
            'the photo of a {}',
            'a photo of a clean {}',
            'a photo of a dirty {}',
            'a dark photo of the {}',
            'a photo of my {}',
            'a photo of the cool {}',
            'a close-up photo of a {}',
            'a bright photo of the {}',
            'a cropped photo of a {}',
            'a photo of the {}',
            'a good photo of the {}',
            'a photo of one {}',
            'a close-up photo of the {}',
            'a rendition of the {}',
            'a photo of the clean {}',
            'a rendition of a {}',
            'a photo of a nice {}',
            'a good photo of a {}',
            'a photo of the nice {}',
            'a photo of the small {}',
            'a photo of the weird {}',
            'a photo of the large {}',
            'a photo of a cool {}',
            'a photo of a small {}',
        ]

        HUMAN_TEMPLATE = [
            'a photo of a {}', 'a photo of one {}', 'a photo of the {}',
            'the photo of a {}', 'a rendering of a {}',
            'a rendition of the {}', 'a rendition of a {}',
            'a cropped photo of the {}', 'a cropped photo of a {}',
            'a bad photo of the {}', 'a bad photo of a {}',
            'a photo of a weird {}', 'a weird photo of a {}',
            'a bright photo of the {}', 'a good photo of the {}',
            'a photo of a nice {}', 'a good photo of a {}',
            'a photo of a cool {}', 'a bright photo of the {}'
        ]

        if enhance_type == 'object':
            self.templates = OBJECT_TEMPLATE
        elif enhance_type == 'style':
            self.templates = STYLE_TEMPLATE
        elif enhance_type == 'human':
            self.templates = HUMAN_TEMPLATE
        else:
            raise NotImplementedError

    def forward(self, img, **kwargs):
        concept_token = kwargs['prompts'].strip()
        kwargs['prompts'] = random.choice(self.templates).format(concept_token)
        return img, kwargs
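The transforms above are resolved through TRANSFORM_REGISTRY, so a preprocessing pipeline can be assembled purely from config dicts. A small sketch of that dispatch follows; the parameter values are illustrative, not taken from the commit.

# Minimal sketch of the registry-based dispatch (illustrative values).
from PIL import Image

from mixofshow.data.pil_transform import PairCompose, build_transform

# build_transform pops 'type', looks it up in TRANSFORM_REGISTRY and passes the rest as kwargs
pair_transform = PairCompose([
    build_transform({'type': 'PairResize', 'size': 512}),
    build_transform({'type': 'PairRandomHorizontalFlip', 'p': 0.5}),
    build_transform({'type': 'ToTensor'}),
])

img = Image.new('RGB', (768, 512))
mask = Image.new('L', (768, 512))
# PairCompose routes single-argument transforms to the image only, and pair
# transforms to both the image and the extra kwargs (mask, prompts, ...)
img_t, extra = pair_transform(img, mask=mask, prompts='a photo of a dog')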
mixofshow/data/prompt_dataset.py ADDED
@@ -0,0 +1,67 @@
import os
import random
import re

import torch
from torch.utils.data import Dataset


class PromptDataset(Dataset):
    'A simple dataset to prepare the prompts to generate class images on multiple GPUs.'

    def __init__(self, opt):
        self.opt = opt

        self.prompts = opt['prompts']

        if isinstance(self.prompts, list):
            self.prompts = self.prompts
        elif os.path.exists(self.prompts):
            # is file
            with open(self.prompts, 'r') as fr:
                lines = fr.readlines()
                lines = [item.strip() for item in lines]
                self.prompts = lines
        else:
            raise ValueError(
                'prompts should be a prompt file path or prompt list, please check!'
            )

        self.prompts = self.replace_placeholder(self.prompts)

        self.num_samples_per_prompt = opt['num_samples_per_prompt']
        self.prompts_to_generate = [
            (p, i) for i in range(1, self.num_samples_per_prompt + 1)
            for p in self.prompts
        ]
        self.latent_size = opt['latent_size']  # (4, 64, 64)
        self.share_latent_across_prompt = opt.get('share_latent_across_prompt', True)  # (true, false)

    def replace_placeholder(self, prompts):
        # replace placeholder token
        replace_mapping = self.opt.get('replace_mapping', {})
        new_lines = []
        for line in self.prompts:
            if len(line.strip()) == 0:
                continue
            for k, v in replace_mapping.items():
                line = line.replace(k, v)
            line = line.strip()
            line = re.sub(' +', ' ', line)
            new_lines.append(line)
        return new_lines

    def __len__(self):
        return len(self.prompts_to_generate)

    def __getitem__(self, index):
        prompt, indice = self.prompts_to_generate[index]
        example = {}
        example['prompts'] = prompt
        example['indices'] = indice
        if self.share_latent_across_prompt:
            seed = indice
        else:
            seed = random.randint(0, 1000)
        example['latents'] = torch.randn(self.latent_size, generator=torch.manual_seed(seed))
        return example
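A hypothetical options dict for PromptDataset, matching the keys read in __init__; the prompt strings and token names are placeholders, not values shipped in this commit.

# Hedged usage sketch (illustrative values).
opt = {
    'prompts': ['a photo of <TOK>', 'a painting of <TOK>'],  # or a path to a text file, one prompt per line
    'replace_mapping': {'<TOK>': '<concept1> <concept2>'},
    'num_samples_per_prompt': 4,
    'latent_size': (4, 64, 64),           # latent shape, as noted in the comment in __init__
    'share_latent_across_prompt': True,   # reuse the same seed (hence the same latent) for a given sample index
}

# from mixofshow.data.prompt_dataset import PromptDataset
# dataset = PromptDataset(opt)            # yields 'prompts', 'indices' and a deterministic 'latents' tensor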
mixofshow/models/__pycache__/edlora.cpython-310.pyc ADDED
Binary file (6.96 kB).

mixofshow/models/__pycache__/edlora.cpython-38.pyc ADDED
Binary file (6.96 kB).

mixofshow/models/__pycache__/edlora.cpython-39.pyc ADDED
Binary file (6.95 kB).
mixofshow/models/edlora.py ADDED
@@ -0,0 +1,259 @@
import math

import numpy as np
import torch
import torch.nn as nn
from diffusers.models.attention_processor import AttnProcessor
from diffusers.utils.import_utils import is_xformers_available

if is_xformers_available():
    import xformers


def remove_edlora_unet_attention_forward(unet):
    def change_forward(unet):  # omit processor in new diffusers
        for name, layer in unet.named_children():
            if layer.__class__.__name__ == 'Attention' and name == 'attn2':
                layer.set_processor(AttnProcessor())
            else:
                change_forward(layer)
    change_forward(unet)


class EDLoRA_Control_AttnProcessor:
    r"""
    Default processor for performing attention-related computations.
    """
    def __init__(self, cross_attention_idx, place_in_unet, controller, attention_op=None):
        self.cross_attention_idx = cross_attention_idx
        self.place_in_unet = place_in_unet
        self.controller = controller
        self.attention_op = attention_op

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        if encoder_hidden_states is None:
            is_cross = False
            encoder_hidden_states = hidden_states
        else:
            is_cross = True
            if len(encoder_hidden_states.shape) == 4:  # multi-layer embedding
                encoder_hidden_states = encoder_hidden_states[:, self.cross_attention_idx, ...]
            else:  # single layer embedding
                encoder_hidden_states = encoder_hidden_states

            assert not attn.norm_cross

        batch_size, sequence_length, _ = encoder_hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)
        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.head_to_batch_dim(query).contiguous()
        key = attn.head_to_batch_dim(key).contiguous()
        value = attn.head_to_batch_dim(value).contiguous()

        if is_xformers_available() and not is_cross:
            hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
            hidden_states = hidden_states.to(query.dtype)
        else:
            attention_probs = attn.get_attention_scores(query, key, attention_mask)
            attention_probs = self.controller(attention_probs, is_cross, self.place_in_unet)
            hidden_states = torch.bmm(attention_probs, value)

        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class EDLoRA_AttnProcessor:
    def __init__(self, cross_attention_idx, attention_op=None):
        self.attention_op = attention_op
        self.cross_attention_idx = cross_attention_idx

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        else:
            if len(encoder_hidden_states.shape) == 4:  # multi-layer embedding
                encoder_hidden_states = encoder_hidden_states[:, self.cross_attention_idx, ...]
            else:  # single layer embedding
                encoder_hidden_states = encoder_hidden_states

            assert not attn.norm_cross

        batch_size, sequence_length, _ = encoder_hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)
        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.head_to_batch_dim(query).contiguous()
        key = attn.head_to_batch_dim(key).contiguous()
        value = attn.head_to_batch_dim(value).contiguous()

        if is_xformers_available():
            hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
            hidden_states = hidden_states.to(query.dtype)
        else:
            attention_probs = attn.get_attention_scores(query, key, attention_mask)
            hidden_states = torch.bmm(attention_probs, value)

        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


def revise_edlora_unet_attention_forward(unet):
    def change_forward(unet, count):
        for name, layer in unet.named_children():
            if layer.__class__.__name__ == 'Attention' and 'attn2' in name:
                layer.set_processor(EDLoRA_AttnProcessor(count))
                count += 1
            else:
                count = change_forward(layer, count)
        return count

    # use this to ensure the order
    cross_attention_idx = change_forward(unet.down_blocks, 0)
    cross_attention_idx = change_forward(unet.mid_block, cross_attention_idx)
    cross_attention_idx = change_forward(unet.up_blocks, cross_attention_idx)
    print(f'Number of attention layer registered {cross_attention_idx}')


def revise_edlora_unet_attention_controller_forward(unet, controller):
    class DummyController:
        def __call__(self, *args):
            return args[0]

        def __init__(self):
            self.num_att_layers = 0

    if controller is None:
        controller = DummyController()

    def change_forward(unet, count, place_in_unet):
        for name, layer in unet.named_children():
            if layer.__class__.__name__ == 'Attention' and 'attn2' in name:  # only register controller for cross-attention
                layer.set_processor(EDLoRA_Control_AttnProcessor(count, place_in_unet, controller))
                count += 1
            else:
                count = change_forward(layer, count, place_in_unet)
        return count

    # use this to ensure the order
    cross_attention_idx = change_forward(unet.down_blocks, 0, 'down')
    cross_attention_idx = change_forward(unet.mid_block, cross_attention_idx, 'mid')
    cross_attention_idx = change_forward(unet.up_blocks, cross_attention_idx, 'up')
    print(f'Number of attention layer registered {cross_attention_idx}')
    controller.num_att_layers = cross_attention_idx


class LoRALinearLayer(nn.Module):
    def __init__(self, name, original_module, rank=4, alpha=1):
        super().__init__()

        self.name = name

        ### Hard coded LoRA rank
        rank = 32

        if original_module.__class__.__name__ == 'Conv2d':
            in_channels, out_channels = original_module.in_channels, original_module.out_channels
            self.lora_down = torch.nn.Conv2d(in_channels, rank, (1, 1), bias=False)
            self.lora_up = torch.nn.Conv2d(rank, out_channels, (1, 1), bias=False)
        else:
            in_features, out_features = original_module.in_features, original_module.out_features
            self.lora_down = nn.Linear(in_features, rank, bias=False)
            self.lora_up = nn.Linear(rank, out_features, bias=False)

        self.register_buffer('alpha', torch.tensor(alpha))

        ### Load and initialize orthogonal B
        m = np.load(f"orthogonal_mats/{in_features}.npy")
        idxs = np.random.choice(in_features, size=rank, replace=False)
        m = m[idxs] / 2
        with torch.no_grad():
            self.lora_down.weight = torch.nn.Parameter(torch.tensor(m, dtype=self.lora_down.weight.dtype))

        torch.nn.init.zeros_(self.lora_up.weight)

        for param in self.lora_down.parameters():
            param.requires_grad = False

        self.original_forward = original_module.forward
        original_module.forward = self.forward

    def forward(self, hidden_states):
        hidden_states = self.original_forward(hidden_states) + self.alpha * self.lora_up(self.lora_down(hidden_states))
        return hidden_states
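LoRALinearLayer draws its frozen lora_down weights from the orthogonal_mats/{320,640,768,1280}.npy files uploaded in this commit: it samples `rank` rows of the square matrix matching the layer's input width, halves them, freezes them, and trains only lora_up. The script that produced those .npy files is not part of the commit; one plausible way to generate such a matrix is a QR decomposition of a random Gaussian matrix, sketched below (the function name and seed are illustrative assumptions).

# Hedged sketch of how the shipped orthogonal matrices could be generated.
import os

import numpy as np

def make_orthogonal_mat(dim, seed=0):
    """Return a dim x dim matrix with orthonormal rows (q @ q.T is close to the identity)."""
    rng = np.random.default_rng(seed)
    q, _ = np.linalg.qr(rng.standard_normal((dim, dim)))
    return q.astype(np.float32)

os.makedirs('orthogonal_mats', exist_ok=True)
# 768 presumably matches the CLIP text width; 320/640/1280 match SD v1.x attention feature widths
for dim in (320, 640, 768, 1280):
    np.save(f'orthogonal_mats/{dim}.npy', make_orthogonal_mat(dim))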
mixofshow/pipelines/__pycache__/pipeline_edlora.cpython-310.pyc ADDED
Binary file (8.81 kB).

mixofshow/pipelines/__pycache__/pipeline_edlora.cpython-38.pyc ADDED
Binary file (8.69 kB).

mixofshow/pipelines/__pycache__/pipeline_edlora.cpython-39.pyc ADDED
Binary file (8.7 kB).

mixofshow/pipelines/__pycache__/pipeline_regionally_t2iadapter.cpython-310.pyc ADDED
Binary file (19.1 kB).

mixofshow/pipelines/__pycache__/pipeline_regionally_t2iadapter.cpython-38.pyc ADDED
Binary file (19 kB).

mixofshow/pipelines/__pycache__/pipeline_regionally_t2iadapter.cpython-39.pyc ADDED
Binary file (19 kB).

mixofshow/pipelines/__pycache__/trainer_edlora.cpython-38.pyc ADDED
Binary file (10.9 kB).

mixofshow/pipelines/__pycache__/trainer_edlora.cpython-39.pyc ADDED
Binary file (10.9 kB).
mixofshow/pipelines/pipeline_edlora.py ADDED
@@ -0,0 +1,322 @@
from typing import Any, Callable, Dict, List, Optional, Union

import torch
from diffusers import StableDiffusionPipeline
from diffusers.configuration_utils import FrozenDict
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import deprecate
from einops import rearrange
from packaging import version
from transformers import CLIPTextModel, CLIPTokenizer

from mixofshow.models.edlora import (revise_edlora_unet_attention_controller_forward,
                                     revise_edlora_unet_attention_forward)


def bind_concept_prompt(prompts, new_concept_cfg):
    if isinstance(prompts, str):
        prompts = [prompts]
    new_prompts = []
    for prompt in prompts:
        prompt = [prompt] * 16
        for concept_name, new_token_cfg in new_concept_cfg.items():
            prompt = [
                p.replace(concept_name, new_name) for p, new_name in zip(prompt, new_token_cfg['concept_token_names'])
            ]
        new_prompts.extend(prompt)
    return new_prompts


class EDLoRAPipeline(StableDiffusionPipeline):

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker=None,
        feature_extractor=None,
        requires_safety_checker: bool = False,
    ):
        if hasattr(scheduler.config, 'steps_offset') and scheduler.config.steps_offset != 1:
            deprecation_message = (
                f'The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`'
                f' should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure '
                'to update the config accordingly as leaving `steps_offset` might led to incorrect results'
                ' in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,'
                ' it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`'
                ' file'
            )
            deprecate('steps_offset!=1', '1.0.0', deprecation_message, standard_warn=False)
            new_config = dict(scheduler.config)
            new_config['steps_offset'] = 1
            scheduler._internal_dict = FrozenDict(new_config)

        if hasattr(scheduler.config, 'clip_sample') and scheduler.config.clip_sample is True:
            deprecation_message = (
                f'The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`.'
                ' `clip_sample` should be set to False in the configuration file. Please make sure to update the'
                ' config accordingly as not setting `clip_sample` in the config might lead to incorrect results in'
                ' future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very'
                ' nice if you could open a Pull request for the `scheduler/scheduler_config.json` file'
            )
            deprecate('clip_sample not set', '1.0.0', deprecation_message, standard_warn=False)
            new_config = dict(scheduler.config)
            new_config['clip_sample'] = False
            scheduler._internal_dict = FrozenDict(new_config)

        is_unet_version_less_0_9_0 = hasattr(unet.config, '_diffusers_version') and version.parse(
            version.parse(unet.config._diffusers_version).base_version
        ) < version.parse('0.9.0.dev0')
        is_unet_sample_size_less_64 = hasattr(unet.config, 'sample_size') and unet.config.sample_size < 64
        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
            deprecation_message = (
                'The configuration file of the unet has set the default `sample_size` to smaller than'
                ' 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the'
                ' following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-'
                ' CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5'
                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
                ' configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`'
                ' in the config might lead to incorrect results in future versions. If you have downloaded this'
                ' checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for'
                ' the `unet/config.json` file'
            )
            deprecate('sample_size<64', '1.0.0', deprecation_message, standard_warn=False)
            new_config = dict(unet.config)
            new_config['sample_size'] = 64
            unet._internal_dict = FrozenDict(new_config)

        revise_edlora_unet_attention_forward(unet)
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.new_concept_cfg = None

    def set_new_concept_cfg(self, new_concept_cfg=None):
        self.new_concept_cfg = new_concept_cfg

    def set_controller(self, controller):
        self.controller = controller
        revise_edlora_unet_attention_controller_forward(self.unet, controller)

    def _encode_prompt(self,
                       prompt,
                       new_concept_cfg,
                       device,
                       num_images_per_prompt,
                       do_classifier_free_guidance,
                       negative_prompt=None,
                       prompt_embeds: Optional[torch.FloatTensor] = None,
                       negative_prompt_embeds: Optional[torch.FloatTensor] = None
                       ):

        assert num_images_per_prompt == 1, 'only support num_images_per_prompt=1 now'

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:

            prompt_extend = bind_concept_prompt(prompt, new_concept_cfg)

            text_inputs = self.tokenizer(
                prompt_extend,
                padding='max_length',
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors='pt',
            )
            text_input_ids = text_inputs.input_ids

            prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
            prompt_embeds = rearrange(prompt_embeds, '(b n) m c -> b n m c', b=batch_size)

        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)

        bs_embed, layer_num, seq_len, _ = prompt_embeds.shape

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [''] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(f'`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !='
                                f' {type(prompt)}.')
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f'`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:'
                    f' {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches'
                    ' the batch size of `prompt`.')
            else:
                uncond_tokens = negative_prompt

            uncond_input = self.tokenizer(
                uncond_tokens,
                padding='max_length',
                max_length=seq_len,
                truncation=True,
                return_tensors='pt',
            )

            negative_prompt_embeds = self.text_encoder(uncond_input.input_ids.to(device))[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
            negative_prompt_embeds = (negative_prompt_embeds).view(batch_size, 1, seq_len, -1).repeat(1, layer_num, 1, 1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
        return prompt_embeds

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = 'pil',
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):

        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt, this support pplus and edlora (layer-wise embedding)
        assert self.new_concept_cfg is not None
        prompt_embeds = self._encode_prompt(
            prompt,
            self.new_concept_cfg,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                if hasattr(self, 'controller'):
                    dtype = latents.dtype
                    latents = self.controller.step_callback(latents)
                    latents = latents.to(dtype)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        if output_type == 'latent':
            image = latents
        elif output_type == 'pil':
            # 8. Post-processing
            image = self.decode_latents(latents)

            # 10. Convert to PIL
            image = self.numpy_to_pil(image)
        else:
            # 8. Post-processing
            image = self.decode_latents(latents)

        # Offload last model to CPU
        if hasattr(self, 'final_offload_hook') and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
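The pipeline expects new_concept_cfg to map each concept placeholder to per-layer token names; bind_concept_prompt then expands every prompt into 16 layer-specific copies before text encoding. A small sketch with made-up token names (the real config is produced elsewhere in the project):

# Hedged sketch of the layer-wise prompt binding used by _encode_prompt (token names are illustrative).
from mixofshow.pipelines.pipeline_edlora import bind_concept_prompt

new_concept_cfg = {
    '<concept>': {'concept_token_names': [f'<concept_{i}>' for i in range(16)]}  # one token per cross-attention layer
}

layer_prompts = bind_concept_prompt('a photo of <concept>', new_concept_cfg)
# -> ['a photo of <concept_0>', ..., 'a photo of <concept_15>']
# _encode_prompt encodes these and rearranges them into a (batch, 16, seq_len, dim) layer-wise
# embedding, which the EDLoRA attention processors index by cross_attention_idx.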
mixofshow/pipelines/pipeline_regionally_t2iadapter.py ADDED
@@ -0,0 +1,608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Any, Callable, Dict, List, Optional, Union
3
+
4
+ import PIL
5
+ import torch
6
+ from diffusers.image_processor import VaeImageProcessor
7
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
8
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
9
+ from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_adapter import (StableDiffusionAdapterPipeline,
10
+ StableDiffusionAdapterPipelineOutput,
11
+ _preprocess_adapter_image)
12
+ from diffusers.schedulers import KarrasDiffusionSchedulers
13
+ from diffusers.utils import logging
14
+ from diffusers.utils.import_utils import is_xformers_available
15
+ from einops import rearrange
16
+ from torch import einsum
17
+ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
18
+
19
+ if is_xformers_available():
20
+ import xformers
21
+
22
+ from mixofshow.pipelines.pipeline_edlora import bind_concept_prompt
23
+
24
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
25
+
26
+
27
+ class RegionT2I_AttnProcessor:
28
+ def __init__(self, cross_attention_idx, attention_op=None):
29
+ self.attention_op = attention_op
30
+ self.cross_attention_idx = cross_attention_idx
31
+
32
+ def region_rewrite(self, attn, hidden_states, query, region_list, height, width):
33
+
34
+ def get_region_mask(region_list, feat_height, feat_width):
35
+ exclusive_mask = torch.zeros((feat_height, feat_width))
36
+ for region in region_list:
37
+ start_h, start_w, end_h, end_w = region[-1]
38
+ start_h, start_w, end_h, end_w = math.ceil(start_h * feat_height), math.ceil(
39
+ start_w * feat_width), math.floor(end_h * feat_height), math.floor(end_w * feat_width)
40
+ exclusive_mask[start_h:end_h, start_w:end_w] += 1
41
+ return exclusive_mask
42
+
43
+ dtype = query.dtype
44
+ seq_lens = query.shape[1]
45
+ downscale = math.sqrt(height * width / seq_lens)
46
+
47
+ # 0: context >=1: may be overlap
48
+ feat_height, feat_width = int(height // downscale), int(width // downscale)
49
+ region_mask = get_region_mask(region_list, feat_height, feat_width)
50
+
51
+ query = rearrange(query, 'b (h w) c -> b h w c', h=feat_height, w=feat_width)
52
+ hidden_states = rearrange(hidden_states, 'b (h w) c -> b h w c', h=feat_height, w=feat_width)
53
+
54
+ new_hidden_state = torch.zeros_like(hidden_states)
55
+ new_hidden_state[:, region_mask == 0, :] = hidden_states[:, region_mask == 0, :]
56
+
57
+ replace_ratio = 1.0
58
+ new_hidden_state[:, region_mask != 0, :] = (1 - replace_ratio) * hidden_states[:, region_mask != 0, :]
59
+
60
+ for region in region_list:
61
+ region_key, region_value, region_box = region
62
+
63
+ if attn.upcast_attention:
64
+ query = query.float()
65
+ region_key = region_key.float()
66
+
67
+ start_h, start_w, end_h, end_w = region_box
68
+ start_h, start_w, end_h, end_w = math.ceil(start_h * feat_height), math.ceil(
69
+ start_w * feat_width), math.floor(end_h * feat_height), math.floor(end_w * feat_width)
70
+
71
+ attention_region = einsum('b h w c, b n c -> b h w n', query[:, start_h:end_h, start_w:end_w, :], region_key) * attn.scale
72
+ if attn.upcast_softmax:
73
+ attention_region = attention_region.float()
74
+
75
+ attention_region = attention_region.softmax(dim=-1)
76
+ attention_region = attention_region.to(dtype)
77
+
78
+ hidden_state_region = einsum('b h w n, b n c -> b h w c', attention_region, region_value)
79
+ new_hidden_state[:, start_h:end_h, start_w:end_w, :] += \
80
+ replace_ratio * (hidden_state_region / (
81
+ region_mask.reshape(
82
+ 1, *region_mask.shape, 1)[:, start_h:end_h, start_w:end_w, :]
83
+ ).to(query.device))
84
+
85
+ new_hidden_state = rearrange(new_hidden_state, 'b h w c -> b (h w) c')
86
+ return new_hidden_state
87
+
88
+ def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None, **cross_attention_kwargs):
89
+ batch_size, sequence_length, _ = hidden_states.shape
90
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
91
+ query = attn.to_q(hidden_states)
92
+
93
+ if encoder_hidden_states is None:
94
+ is_cross = False
95
+ encoder_hidden_states = hidden_states
96
+ else:
97
+ is_cross = True
98
+
99
+ if len(encoder_hidden_states.shape) == 4: # multi-layer embedding
100
+ encoder_hidden_states = encoder_hidden_states[:, self.cross_attention_idx, ...]
101
+ else:
102
+ encoder_hidden_states = encoder_hidden_states
103
+
104
+ key = attn.to_k(encoder_hidden_states)
105
+ value = attn.to_v(encoder_hidden_states)
106
+
107
+ query = attn.head_to_batch_dim(query)
108
+ key = attn.head_to_batch_dim(key)
109
+ value = attn.head_to_batch_dim(value)
110
+
111
+ if is_xformers_available() and not is_cross:
112
+ hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
113
+ hidden_states = hidden_states.to(query.dtype)
114
+ else:
115
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
116
+ hidden_states = torch.bmm(attention_probs, value)
117
+
118
+ if is_cross:
119
+ region_list = []
120
+ for region in cross_attention_kwargs['region_list']:
121
+ if len(region[0].shape) == 4:
122
+ region_key = attn.to_k(region[0][:, self.cross_attention_idx, ...])
123
+ region_value = attn.to_v(region[0][:, self.cross_attention_idx, ...])
124
+ else:
125
+ region_key = attn.to_k(region[0])
126
+ region_value = attn.to_v(region[0])
127
+ region_key = attn.head_to_batch_dim(region_key)
128
+ region_value = attn.head_to_batch_dim(region_value)
129
+ region_list.append((region_key, region_value, region[1]))
130
+
131
+ hidden_states = self.region_rewrite(
132
+ attn=attn,
133
+ hidden_states=hidden_states,
134
+ query=query,
135
+ region_list=region_list,
136
+ height=cross_attention_kwargs['height'],
137
+ width=cross_attention_kwargs['width'])
138
+
139
+ hidden_states = attn.batch_to_head_dim(hidden_states)
140
+
141
+ # linear proj
142
+ hidden_states = attn.to_out[0](hidden_states)
143
+ # dropout
144
+ hidden_states = attn.to_out[1](hidden_states)
145
+ return hidden_states
146
+
147
+
148
+ def revise_regionally_t2iadapter_attention_forward(unet):
149
+ def change_forward(unet, count):
150
+ for name, layer in unet.named_children():
151
+ if layer.__class__.__name__ == 'Attention':
152
+ layer.set_processor(RegionT2I_AttnProcessor(count))
153
+ if 'attn2' in name:
154
+ count += 1
155
+ else:
156
+ count = change_forward(layer, count)
157
+ return count
158
+
159
+ # use this to ensure the order
160
+ cross_attention_idx = change_forward(unet.down_blocks, 0)
161
+ cross_attention_idx = change_forward(unet.mid_block, cross_attention_idx)
162
+ cross_attention_idx = change_forward(unet.up_blocks, cross_attention_idx)
163
+ print(f'Number of attention layers registered: {cross_attention_idx}')
164
+
165
+
166
+ class RegionallyT2IAdapterPipeline(StableDiffusionAdapterPipeline):
167
+ _optional_components = ['safety_checker', 'feature_extractor']
168
+
169
+ def __init__(
170
+ self,
171
+ vae: AutoencoderKL,
172
+ text_encoder: CLIPTextModel,
173
+ tokenizer: CLIPTokenizer,
174
+ unet: UNet2DConditionModel,
175
+ scheduler: KarrasDiffusionSchedulers,
176
+ safety_checker: StableDiffusionSafetyChecker,
177
+ feature_extractor: CLIPFeatureExtractor,
178
+ requires_safety_checker: bool = False,
179
+ ):
180
+
181
+ if safety_checker is None and requires_safety_checker:
182
+ logger.warning(
183
+ f'You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure'
184
+ ' that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered'
185
+ ' results in services or applications open to the public. Both the diffusers team and Hugging Face'
186
+ ' strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling'
187
+ ' it only for use-cases that involve analyzing network behavior or auditing its results. For more'
188
+ ' information, please have a look at https://github.com/huggingface/diffusers/pull/254 .'
189
+ )
190
+
191
+ if safety_checker is not None and feature_extractor is None:
192
+ raise ValueError(
193
+ f'Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety'
194
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
195
+ )
196
+
197
+ self.register_modules(
198
+ vae=vae,
199
+ text_encoder=text_encoder,
200
+ tokenizer=tokenizer,
201
+ unet=unet,
202
+ scheduler=scheduler,
203
+ safety_checker=safety_checker,
204
+ feature_extractor=feature_extractor,
205
+ )
206
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
207
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
208
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
209
+ self.new_concept_cfg = None
210
+ revise_regionally_t2iadapter_attention_forward(self.unet)
211
+
212
+ def set_new_concept_cfg(self, new_concept_cfg=None):
213
+ self.new_concept_cfg = new_concept_cfg
214
+
215
+ def _encode_region_prompt(self,
216
+ prompt,
217
+ new_concept_cfg,
218
+ device,
219
+ num_images_per_prompt,
220
+ do_classifier_free_guidance,
221
+ negative_prompt=None,
222
+ prompt_embeds: Optional[torch.FloatTensor] = None,
223
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
224
+ height=512,
225
+ width=512
226
+ ):
227
+ if prompt is not None and isinstance(prompt, str):
228
+ batch_size = 1
229
+ elif prompt is not None and isinstance(prompt, list):
230
+ batch_size = len(prompt)
231
+ else:
232
+ batch_size = prompt_embeds.shape[0]
233
+
234
+ assert batch_size == 1, 'only sample one prompt once in this version'
235
+
236
+ if prompt_embeds is None:
237
+ context_prompt, region_list = prompt[0][0], prompt[0][1]
238
+ context_prompt = bind_concept_prompt([context_prompt], new_concept_cfg)
239
+ context_prompt_input_ids = self.tokenizer(
240
+ context_prompt,
241
+ padding='max_length',
242
+ max_length=self.tokenizer.model_max_length,
243
+ truncation=True,
244
+ return_tensors='pt',
245
+ ).input_ids
246
+
247
+ prompt_embeds = self.text_encoder(context_prompt_input_ids.to(device), attention_mask=None)[0]
248
+ prompt_embeds = rearrange(prompt_embeds, '(b n) m c -> b n m c', b=batch_size)
249
+ prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
250
+
251
+ bs_embed, layer_num, seq_len, _ = prompt_embeds.shape
252
+
253
+ if negative_prompt is None:
254
+ negative_prompt = [''] * batch_size
255
+
256
+ negative_prompt_input_ids = self.tokenizer(
257
+ negative_prompt,
258
+ padding='max_length',
259
+ max_length=self.tokenizer.model_max_length,
260
+ truncation=True,
261
+ return_tensors='pt').input_ids
262
+
263
+ negative_prompt_embeds = self.text_encoder(
264
+ negative_prompt_input_ids.to(device),
265
+ attention_mask=None,
266
+ )[0]
267
+
268
+ negative_prompt_embeds = (negative_prompt_embeds).view(batch_size, 1, seq_len, -1).repeat(1, layer_num, 1, 1)
269
+ negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
270
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
271
+
272
+ for idx, region in enumerate(region_list):
273
+ region_prompt, region_neg_prompt, pos = region
274
+ region_prompt = bind_concept_prompt([region_prompt], new_concept_cfg)
275
+ region_prompt_input_ids = self.tokenizer(
276
+ region_prompt,
277
+ padding='max_length',
278
+ max_length=self.tokenizer.model_max_length,
279
+ truncation=True,
280
+ return_tensors='pt').input_ids
281
+ region_embeds = self.text_encoder(region_prompt_input_ids.to(device), attention_mask=None)[0]
282
+ region_embeds = rearrange(region_embeds, '(b n) m c -> b n m c', b=batch_size)
283
+ region_embeds.to(dtype=self.text_encoder.dtype, device=device)
284
+ bs_embed, layer_num, seq_len, _ = region_embeds.shape
285
+
286
+ if region_neg_prompt is None:
287
+ region_neg_prompt = [''] * batch_size
288
+ region_negprompt_input_ids = self.tokenizer(
289
+ region_neg_prompt,
290
+ padding='max_length',
291
+ max_length=self.tokenizer.model_max_length,
292
+ truncation=True,
293
+ return_tensors='pt').input_ids
294
+ region_neg_embeds = self.text_encoder(region_negprompt_input_ids.to(device), attention_mask=None)[0]
295
+ region_neg_embeds = (region_neg_embeds).view(batch_size, 1, seq_len, -1).repeat(1, layer_num, 1, 1)
296
+ region_neg_embeds.to(dtype=self.text_encoder.dtype, device=device)
297
+ region_list[idx] = (torch.cat([region_neg_embeds, region_embeds]), pos)
298
+
299
+ return prompt_embeds, region_list
300
+
301
+ @torch.no_grad()
302
+ def __call__(
303
+ self,
304
+ prompt: Union[str, List[str]] = None,
305
+ keypose_adapter_input: Union[torch.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None,
306
+ keypose_adaptor_weight=1.0,
307
+ region_keypose_adaptor_weight='',
308
+ sketch_adapter_input: Union[torch.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None,
309
+ sketch_adaptor_weight=1.0,
310
+ region_sketch_adaptor_weight='',
311
+ height: Optional[int] = None,
312
+ width: Optional[int] = None,
313
+ num_inference_steps: int = 50,
314
+ guidance_scale: float = 7.5,
315
+ negative_prompt: Optional[Union[str, List[str]]] = None,
316
+ num_images_per_prompt: Optional[int] = 1,
317
+ eta: float = 0.0,
318
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
319
+ latents: Optional[torch.FloatTensor] = None,
320
+ prompt_embeds: Optional[torch.FloatTensor] = None,
321
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
322
+ output_type: Optional[str] = 'pil',
323
+ return_dict: bool = True,
324
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
325
+ callback_steps: int = 1,
326
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
327
+ ):
328
+ r"""
329
+ Function invoked when calling the pipeline for generation.
330
+
331
+ Args:
332
+ prompt (`str` or `List[str]`, *optional*):
333
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
334
+ instead.
335
+ image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
336
+ The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
337
+ type is specified as `torch.FloatTensor`, it is passed to the Adapter as is. `PIL.Image.Image` can also be
338
+ accepted as an image. The control image is automatically resized to fit the output image.
339
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
340
+ The height in pixels of the generated image.
341
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
342
+ The width in pixels of the generated image.
343
+ num_inference_steps (`int`, *optional*, defaults to 50):
344
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
345
+ expense of slower inference.
346
+ guidance_scale (`float`, *optional*, defaults to 7.5):
347
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
348
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
349
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
350
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
351
+ usually at the expense of lower image quality.
352
+ negative_prompt (`str` or `List[str]`, *optional*):
353
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
354
+ `negative_prompt_embeds` instead.
355
+ Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
356
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
357
+ The number of images to generate per prompt.
358
+ eta (`float`, *optional*, defaults to 0.0):
359
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
360
+ [`schedulers.DDIMScheduler`], will be ignored for others.
361
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
362
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
363
+ to make generation deterministic.
364
+ latents (`torch.FloatTensor`, *optional*):
365
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
366
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
367
+ tensor will be generated by sampling using the supplied random `generator`.
368
+ prompt_embeds (`torch.FloatTensor`, *optional*):
369
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
370
+ provided, text embeddings will be generated from `prompt` input argument.
371
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
372
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
373
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
374
+ argument.
375
+ output_type (`str`, *optional*, defaults to `"pil"`):
376
+ The output format of the generated image. Choose between
377
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
378
+ return_dict (`bool`, *optional*, defaults to `True`):
379
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] instead
380
+ of a plain tuple.
381
+ callback (`Callable`, *optional*):
382
+ A function that will be called every `callback_steps` steps during inference. The function will be
383
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
384
+ callback_steps (`int`, *optional*, defaults to 1):
385
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
386
+ called at every step.
387
+ cross_attention_kwargs (`dict`, *optional*):
388
+ A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
389
+ `self.processor` in
390
+ [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
391
+ adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
392
+ The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the
393
+ residual in the original unet. If multiple adapters are specified in init, you can set the
394
+ corresponding scale as a list.
395
+
396
+ Examples:
397
+
398
+ Returns:
399
+ [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] or `tuple`:
400
+ [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] if `return_dict` is True, otherwise a
401
+ `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
402
+ element is a list of `bool`s denoting whether the corresponding generated image likely represents
403
+ "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
404
+ """
405
+ # 0. Default height and width to unet
406
+ device = self._execution_device
407
+
408
+ # 1. Check inputs. Raise error if not correct
409
+ self.check_inputs(
410
+ prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
411
+ )
412
+
413
+ if keypose_adapter_input is not None:
414
+ keypose_input = _preprocess_adapter_image(keypose_adapter_input, height, width).to(self.device)
415
+ keypose_input = keypose_input.to(self.keypose_adapter.dtype)
416
+ else:
417
+ keypose_input = None
418
+
419
+ if sketch_adapter_input is not None:
420
+ sketch_input = _preprocess_adapter_image(sketch_adapter_input, height, width).to(self.device)
421
+ sketch_input = sketch_input.to(self.sketch_adapter.dtype)
422
+ else:
423
+ sketch_input = None
424
+
425
+ # 2. Define call parameters
426
+ if prompt is not None and isinstance(prompt, str):
427
+ batch_size = 1
428
+ elif prompt is not None and isinstance(prompt, list):
429
+ batch_size = len(prompt)
430
+ else:
431
+ batch_size = prompt_embeds.shape[0]
432
+
433
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
434
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
435
+ # corresponds to doing no classifier free guidance.
436
+ do_classifier_free_guidance = guidance_scale > 1.0
437
+
438
+ # 3. Encode input prompt
439
+ assert self.new_concept_cfg is not None
440
+ prompt_embeds, region_list = self._encode_region_prompt(
441
+ prompt,
442
+ self.new_concept_cfg,
443
+ device,
444
+ num_images_per_prompt,
445
+ do_classifier_free_guidance,
446
+ negative_prompt,
447
+ prompt_embeds=prompt_embeds,
448
+ negative_prompt_embeds=negative_prompt_embeds,
449
+ height=height,
450
+ width=width
451
+ )
452
+
453
+ # 4. Prepare timesteps
454
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
455
+ timesteps = self.scheduler.timesteps
456
+
457
+ # 5. Prepare latent variables
458
+ num_channels_latents = self.unet.config.in_channels
459
+ latents = self.prepare_latents(
460
+ batch_size * num_images_per_prompt,
461
+ num_channels_latents,
462
+ height,
463
+ width,
464
+ prompt_embeds.dtype,
465
+ device,
466
+ generator,
467
+ latents,
468
+ )
469
+
470
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
471
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
472
+
473
+ # 7. Denoising loop
474
+ if keypose_input is not None:
475
+ keypose_adapter_state = self.keypose_adapter(keypose_input)
476
+ else:
477
+ keypose_adapter_state = None
478
+
479
+ if sketch_input is not None:
480
+ sketch_adapter_state = self.sketch_adapter(sketch_input)
481
+ else:
482
+ sketch_adapter_state = None
483
+
484
+ num_states = len(keypose_adapter_state) if keypose_adapter_state is not None else len(sketch_adapter_state)
485
+
486
+ adapter_state = []
487
+
488
+ for idx in range(num_states):
489
+ if keypose_adapter_state is not None:
490
+ feat_keypose = keypose_adapter_state[idx]
491
+
492
+ spatial_adaptor_weight = keypose_adaptor_weight * torch.ones(*feat_keypose.shape[2:]).to(
493
+ feat_keypose.dtype).to(feat_keypose.device)
494
+
495
+ if region_keypose_adaptor_weight != '':
496
+ region_list = region_keypose_adaptor_weight.split('|')
497
+
498
+ for region_weight in region_list:
499
+ region, weight = region_weight.split('-')
500
+ region = eval(region)
501
+ weight = eval(weight)
502
+ feat_height, feat_width = feat_keypose.shape[2:]
503
+ start_h, start_w, end_h, end_w = region
504
+ start_h, end_h = start_h / height, end_h / height
505
+ start_w, end_w = start_w / width, end_w / width
506
+
507
+ start_h, start_w, end_h, end_w = math.ceil(start_h * feat_height), math.ceil(
508
+ start_w * feat_width), math.floor(end_h * feat_height), math.floor(end_w * feat_width)
509
+
510
+ spatial_adaptor_weight[start_h:end_h, start_w:end_w] = weight
511
+ feat_keypose = spatial_adaptor_weight * feat_keypose
512
+
513
+ else:
514
+ feat_keypose = 0
515
+
516
+ if sketch_adapter_state is not None:
517
+ feat_sketch = sketch_adapter_state[idx]
518
+ # print(feat_keypose.shape) # torch.Size([1, 320, 64, 128])
519
+ spatial_adaptor_weight = sketch_adaptor_weight * torch.ones(*feat_sketch.shape[2:]).to(
520
+ feat_sketch.dtype).to(feat_sketch.device)
521
+
522
+ if region_sketch_adaptor_weight != '':
523
+ region_list = region_sketch_adaptor_weight.split('|')
524
+
525
+ for region_weight in region_list:
526
+ region, weight = region_weight.split('-')
527
+ region = eval(region)
528
+ weight = eval(weight)
529
+ feat_height, feat_width = feat_sketch.shape[2:]
530
+ start_h, start_w, end_h, end_w = region
531
+ start_h, end_h = start_h / height, end_h / height
532
+ start_w, end_w = start_w / width, end_w / width
533
+
534
+ start_h, start_w, end_h, end_w = math.ceil(start_h * feat_height), math.ceil(
535
+ start_w * feat_width), math.floor(end_h * feat_height), math.floor(end_w * feat_width)
536
+
537
+ spatial_adaptor_weight[start_h:end_h, start_w:end_w] = weight
538
+ feat_sketch = spatial_adaptor_weight * feat_sketch
539
+ else:
540
+ feat_sketch = 0
541
+
542
+ adapter_state.append(feat_keypose + feat_sketch)
543
+
544
+ if do_classifier_free_guidance:
545
+ for k, v in enumerate(adapter_state):
546
+ adapter_state[k] = torch.cat([v] * 2, dim=0)
547
+
548
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
549
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
550
+ for i, t in enumerate(timesteps):
551
+ # expand the latents if we are doing classifier free guidance
552
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
553
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
554
+
555
+ # predict the noise residual
556
+ noise_pred = self.unet(
557
+ latent_model_input,
558
+ t,
559
+ encoder_hidden_states=prompt_embeds,
560
+ cross_attention_kwargs={
561
+ 'region_list': region_list,
562
+ 'height': height,
563
+ 'width': width,
564
+ },
565
+ down_block_additional_residuals=[state.clone() for state in adapter_state],
566
+ ).sample
567
+
568
+ # perform guidance
569
+ if do_classifier_free_guidance:
570
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
571
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
572
+
573
+ # compute the previous noisy sample x_t -> x_t-1
574
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
575
+
576
+ # call the callback, if provided
577
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
578
+ progress_bar.update()
579
+ if callback is not None and i % callback_steps == 0:
580
+ callback(i, t, latents)
581
+
582
+ if output_type == 'latent':
583
+ image = latents
584
+ has_nsfw_concept = None
585
+ elif output_type == 'pil':
586
+ # 8. Post-processing
587
+ image = self.decode_latents(latents)
588
+
589
+ # 9. Run safety checker
590
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
591
+
592
+ # 10. Convert to PIL
593
+ image = self.numpy_to_pil(image)
594
+ else:
595
+ # 8. Post-processing
596
+ image = self.decode_latents(latents)
597
+
598
+ # 9. Run safety checker
599
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
600
+
601
+ # Offload last model to CPU
602
+ if hasattr(self, 'final_offload_hook') and self.final_offload_hook is not None:
603
+ self.final_offload_hook.offload()
604
+
605
+ if not return_dict:
606
+ return (image, has_nsfw_concept)
607
+
608
+ return StableDiffusionAdapterPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
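
Usage note: a minimal inference sketch for the regionally controlled pipeline above. The fused checkpoint path, the concept tokens, the adapter repo id, and the assumption that each region `pos` follows the same `[start_h, start_w, end_h, end_w]` pixel convention as the region adaptor weights are illustrative guesses, not part of this commit.

    import json
    import torch
    from diffusers import T2IAdapter
    from PIL import Image
    from mixofshow.pipelines.pipeline_regionally_t2iadapter import RegionallyT2IAdapterPipeline

    # hypothetical fused multi-concept checkpoint and concept config produced offline
    pipe = RegionallyT2IAdapterPipeline.from_pretrained(
        'experiments/composed_edlora', torch_dtype=torch.float16).to('cuda')
    pipe.set_new_concept_cfg(json.load(open('experiments/composed_edlora/new_concept_cfg.json')))

    # attach a sketch adapter whose forward returns multi-scale features;
    # diffusers' T2IAdapter is used here purely for illustration
    pipe.sketch_adapter = T2IAdapter.from_pretrained(
        'TencentARC/t2iadapter_sketch_sd15v2', torch_dtype=torch.float16).to('cuda')

    context_prompt = 'two people standing in a garden, 4K, best quality'
    region_list = [  # (region prompt, region negative prompt, assumed pixel box [start_h, start_w, end_h, end_w])
        ('a <hermione1> <hermione2>, standing in a garden', '', [0, 0, 1024, 512]),
        ('a <potter1> <potter2>, standing in a garden', '', [0, 512, 1024, 1024]),
    ]
    image = pipe(
        [(context_prompt, region_list)],
        sketch_adapter_input=Image.open('sketch.png').convert('RGB'),
        sketch_adaptor_weight=1.0,
        height=1024, width=1024,
        num_inference_steps=50, guidance_scale=7.5).images[0]
    image.save('regional_sample.png')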
mixofshow/pipelines/trainer_edlora.py ADDED
@@ -0,0 +1,380 @@
1
+ import itertools
2
+ import math
3
+ import re
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from accelerate.logging import get_logger
9
+ from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
10
+ from diffusers.utils.import_utils import is_xformers_available
11
+ from einops import rearrange
12
+ from transformers import CLIPTextModel, CLIPTokenizer
13
+
14
+ from mixofshow.models.edlora import (LoRALinearLayer, revise_edlora_unet_attention_controller_forward,
15
+ revise_edlora_unet_attention_forward)
16
+ from mixofshow.pipelines.pipeline_edlora import bind_concept_prompt
17
+ from mixofshow.utils.ptp_util import AttentionStore
18
+
19
+
20
+ class EDLoRATrainer(nn.Module):
21
+ def __init__(
22
+ self,
23
+ pretrained_path,
24
+ new_concept_token,
25
+ initializer_token,
26
+ enable_edlora, # true for ED-LoRA, false for LoRA
27
+ finetune_cfg=None,
28
+ noise_offset=None,
29
+ attn_reg_weight=None,
30
+ reg_full_identity=True, # True for thanos, False for real person (don't need to encode clothes)
31
+ use_mask_loss=True,
32
+ enable_xformers=False,
33
+ gradient_checkpoint=False
34
+ ):
35
+ super().__init__()
36
+
37
+ # 1. Load the model.
38
+ self.vae = AutoencoderKL.from_pretrained(pretrained_path, subfolder='vae')
39
+ self.tokenizer = CLIPTokenizer.from_pretrained(pretrained_path, subfolder='tokenizer')
40
+ self.text_encoder = CLIPTextModel.from_pretrained(pretrained_path, subfolder='text_encoder')
41
+ self.unet = UNet2DConditionModel.from_pretrained(pretrained_path, subfolder='unet')
42
+
43
+ if gradient_checkpoint:
44
+ self.unet.enable_gradient_checkpointing()
45
+
46
+ if enable_xformers:
47
+ assert is_xformers_available(), 'need to install xformers first'
48
+
49
+ # 2. Define train scheduler
50
+ self.scheduler = DDPMScheduler.from_pretrained(pretrained_path, subfolder='scheduler')
51
+
52
+ # 3. define training cfg
53
+ self.enable_edlora = enable_edlora
54
+ self.new_concept_cfg = self.init_new_concept(new_concept_token, initializer_token, enable_edlora=enable_edlora)
55
+
56
+ self.attn_reg_weight = attn_reg_weight
57
+ self.reg_full_identity = reg_full_identity
58
+ if self.attn_reg_weight is not None:
59
+ self.controller = AttentionStore(training=True)
60
+ revise_edlora_unet_attention_controller_forward(self.unet, self.controller) # support both lora and edlora forward
61
+ else:
62
+ revise_edlora_unet_attention_forward(self.unet) # support both lora and edlora forward
63
+
64
+ if finetune_cfg:
65
+ self.set_finetune_cfg(finetune_cfg)
66
+
67
+ self.noise_offset = noise_offset
68
+ self.use_mask_loss = use_mask_loss
69
+
70
+ def set_finetune_cfg(self, finetune_cfg):
71
+ logger = get_logger('mixofshow', log_level='INFO')
72
+ params_to_freeze = [self.vae.parameters(), self.text_encoder.parameters(), self.unet.parameters()]
73
+
74
+ # step 1: freeze all parameters by setting requires_grad to False
75
+ for params in itertools.chain(*params_to_freeze):
76
+ params.requires_grad = False
77
+
78
+ # step 2: begin to add trainable parameters
79
+ params_group_list = []
80
+
81
+ # 1. text embedding
82
+ if finetune_cfg['text_embedding']['enable_tuning']:
83
+ text_embedding_cfg = finetune_cfg['text_embedding']
84
+
85
+ params_list = []
86
+ for params in self.text_encoder.get_input_embeddings().parameters():
87
+ params.requires_grad = True
88
+ params_list.append(params)
89
+
90
+ params_group = {'params': params_list, 'lr': text_embedding_cfg['lr']}
91
+ if 'weight_decay' in text_embedding_cfg:
92
+ params_group.update({'weight_decay': text_embedding_cfg['weight_decay']})
93
+ params_group_list.append(params_group)
94
+ logger.info(f"optimizing embedding using lr: {text_embedding_cfg['lr']}")
95
+
96
+ # 2. text encoder
97
+ if finetune_cfg['text_encoder']['enable_tuning'] and finetune_cfg['text_encoder'].get('lora_cfg'):
98
+ text_encoder_cfg = finetune_cfg['text_encoder']
99
+
100
+ where = text_encoder_cfg['lora_cfg'].pop('where')
101
+ assert where in ['CLIPEncoderLayer', 'CLIPAttention']
102
+
103
+ self.text_encoder_lora = nn.ModuleList()
104
+ params_list = []
105
+
106
+ for name, module in self.text_encoder.named_modules():
107
+ if module.__class__.__name__ == where:
108
+ for child_name, child_module in module.named_modules():
109
+ if child_module.__class__.__name__ == 'Linear':
110
+ lora_module = LoRALinearLayer(name + '.' + child_name, child_module, **text_encoder_cfg['lora_cfg'])
111
+ self.text_encoder_lora.append(lora_module)
112
+ params_list.extend(list(lora_module.parameters()))
113
+
114
+ params_group_list.append({'params': params_list, 'lr': text_encoder_cfg['lr']})
115
+ logger.info(f"optimizing text_encoder ({len(self.text_encoder_lora)} LoRAs), using lr: {text_encoder_cfg['lr']}")
116
+
117
+ # 3. unet
118
+ if finetune_cfg['unet']['enable_tuning'] and finetune_cfg['unet'].get('lora_cfg'):
119
+ unet_cfg = finetune_cfg['unet']
120
+
121
+ where = unet_cfg['lora_cfg'].pop('where')
122
+ assert where in ['Transformer2DModel', 'Attention']
123
+
124
+ self.unet_lora = nn.ModuleList()
125
+ params_list = []
126
+
127
+ for name, module in self.unet.named_modules():
128
+ if module.__class__.__name__ == where:
129
+ for child_name, child_module in module.named_modules():
130
+ if child_module.__class__.__name__ == 'Linear' or (child_module.__class__.__name__ == 'Conv2d' and child_module.kernel_size == (1, 1)):
131
+ lora_module = LoRALinearLayer(name + '.' + child_name, child_module, **unet_cfg['lora_cfg'])
132
+ self.unet_lora.append(lora_module)
133
+ params_list.extend(list(lora_module.parameters()))
134
+
135
+ params_group_list.append({'params': params_list, 'lr': unet_cfg['lr']})
136
+ logger.info(f"optimizing unet ({len(self.unet_lora)} LoRAs), using lr: {unet_cfg['lr']}")
137
+
138
+ # 4. optimize params
139
+ self.params_to_optimize_iterator = params_group_list
140
+
141
+ def get_params_to_optimize(self):
142
+ return self.params_to_optimize_iterator
143
+
144
+ def init_new_concept(self, new_concept_tokens, initializer_tokens, enable_edlora=True):
145
+ logger = get_logger('mixofshow', log_level='INFO')
146
+ new_concept_cfg = {}
147
+ new_concept_tokens = new_concept_tokens.split('+')
148
+
149
+ if initializer_tokens is None:
150
+ initializer_tokens = ['<rand-0.017>'] * len(new_concept_tokens)
151
+ else:
152
+ initializer_tokens = initializer_tokens.split('+')
153
+ assert len(new_concept_tokens) == len(initializer_tokens), 'concept token should match init token.'
154
+
155
+ for idx, (concept_name, init_token) in enumerate(zip(new_concept_tokens, initializer_tokens)):
156
+ if enable_edlora:
157
+ num_new_embedding = 16
158
+ else:
159
+ num_new_embedding = 1
160
+ new_token_names = [f'<new{idx * num_new_embedding + layer_id}>' for layer_id in range(num_new_embedding)]
161
+
162
+ num_added_tokens = self.tokenizer.add_tokens(new_token_names)
163
+ assert num_added_tokens == len(new_token_names), 'some token is already in tokenizer'
164
+ new_token_ids = [self.tokenizer.convert_tokens_to_ids(token_name) for token_name in new_token_names]
165
+
166
+ # init embedding
167
+ self.text_encoder.resize_token_embeddings(len(self.tokenizer))
168
+ token_embeds = self.text_encoder.get_input_embeddings().weight.data
169
+
170
+ if init_token.startswith('<rand'):
171
+ sigma_val = float(re.findall(r'<rand-(.*)>', init_token)[0])
172
+ init_feature = torch.randn_like(token_embeds[0]) * sigma_val
173
+ logger.info(f'{concept_name} ({min(new_token_ids)}-{max(new_token_ids)}) is randomly initialized by: {init_token}')
174
+ else:
175
+ # Convert the initializer_token, placeholder_token to ids
176
+ init_token_ids = self.tokenizer.encode(init_token, add_special_tokens=False)
177
+ # print(token_ids)
178
+ # Check if initializer_token is a single token or a sequence of tokens
179
+ if len(init_token_ids) > 1 or init_token_ids[0] == 40497:
180
+ raise ValueError('The initializer token must be a single existing token.')
181
+ init_feature = token_embeds[init_token_ids]
182
+ logger.info(f'{concept_name} ({min(new_token_ids)}-{max(new_token_ids)}) is initialized from existing token ({init_token}): {init_token_ids[0]}')
183
+
184
+ for token_id in new_token_ids:
185
+ token_embeds[token_id] = init_feature.clone()
186
+
187
+ new_concept_cfg.update({
188
+ concept_name: {
189
+ 'concept_token_ids': new_token_ids,
190
+ 'concept_token_names': new_token_names
191
+ }
192
+ })
193
+
194
+ return new_concept_cfg
195
+
196
+ def get_all_concept_token_ids(self):
197
+ new_concept_token_ids = []
198
+ for _, new_token_cfg in self.new_concept_cfg.items():
199
+ new_concept_token_ids.extend(new_token_cfg['concept_token_ids'])
200
+ return new_concept_token_ids
201
+
202
+ def forward(self, images, prompts, masks, img_masks):
203
+ latents = self.vae.encode(images).latent_dist.sample()
204
+ latents = latents * 0.18215
205
+
206
+ # Sample noise that we'll add to the latents
207
+ noise = torch.randn_like(latents)
208
+ if self.noise_offset is not None:
209
+ noise += self.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device)
210
+
211
+ bsz = latents.shape[0]
212
+ # Sample a random timestep for each image
213
+ timesteps = torch.randint(0, self.scheduler.config.num_train_timesteps, (bsz, ), device=latents.device)
214
+ timesteps = timesteps.long()
215
+
216
+ # Add noise to the latents according to the noise magnitude at each timestep
217
+ # (this is the forward diffusion process)
218
+ noisy_latents = self.scheduler.add_noise(latents, noise, timesteps)
219
+
220
+ if self.enable_edlora:
221
+ prompts = bind_concept_prompt(prompts, new_concept_cfg=self.new_concept_cfg) # edlora
222
+
223
+ # get text ids
224
+ text_input_ids = self.tokenizer(
225
+ prompts,
226
+ padding='max_length',
227
+ max_length=self.tokenizer.model_max_length,
228
+ truncation=True,
229
+ return_tensors='pt').input_ids.to(latents.device)
230
+
231
+ # Get the text embedding for conditioning
232
+ encoder_hidden_states = self.text_encoder(text_input_ids)[0]
233
+ if self.enable_edlora:
234
+ encoder_hidden_states = rearrange(encoder_hidden_states, '(b n) m c -> b n m c', b=latents.shape[0]) # edlora
235
+
236
+ # Predict the noise residual
237
+ model_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample
238
+
239
+ # Get the target for loss depending on the prediction type
240
+ if self.scheduler.config.prediction_type == 'epsilon':
241
+ target = noise
242
+ elif self.scheduler.config.prediction_type == 'v_prediction':
243
+ target = self.scheduler.get_velocity(latents, noise, timesteps)
244
+ else:
245
+ raise ValueError(f'Unknown prediction type {self.scheduler.config.prediction_type}')
246
+
247
+ if self.use_mask_loss:
248
+ loss_mask = masks
249
+ else:
250
+ loss_mask = img_masks
251
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction='none')
252
+ loss = ((loss * loss_mask).sum([1, 2, 3]) / loss_mask.sum([1, 2, 3])).mean()
253
+
254
+ if self.attn_reg_weight is not None:
255
+ attention_maps = self.controller.get_average_attention()
256
+ attention_loss = self.cal_attn_reg(attention_maps, masks, text_input_ids)
257
+ if not torch.isnan(attention_loss): # full mask
258
+ loss = loss + attention_loss
259
+ self.controller.reset()
260
+
261
+ return loss
262
+
263
+ def cal_attn_reg(self, attention_maps, masks, text_input_ids):
264
+ '''
265
+ attention_maps: {down_cross:[], mid_cross:[], up_cross:[]}
266
+ masks: torch.Size([1, 1, 64, 64])
267
+ text_input_ids: torch.Size([16, 77])
268
+ '''
269
+ # step 1: find token position
270
+ batch_size = masks.shape[0]
271
+ text_input_ids = rearrange(text_input_ids, '(b l) n -> b l n', b=batch_size)
272
+ # print(masks.shape) # torch.Size([2, 1, 64, 64])
273
+ # print(text_input_ids.shape) # torch.Size([2, 16, 77])
274
+
275
+ new_token_pos = []
276
+ all_concept_token_ids = self.get_all_concept_token_ids()
277
+ for text in text_input_ids:
278
+ text = text[0] # even with multi-layer embeddings, we only inspect the first layer's token ids
279
+ new_token_pos.append([idx for idx in range(len(text)) if text[idx] in all_concept_token_ids])
280
+
281
+ # step2: aggregate attention maps with resolution and concat heads
282
+ attention_groups = {'64': [], '32': [], '16': [], '8': []}
283
+ for _, attention_list in attention_maps.items():
284
+ for attn in attention_list:
285
+ res = int(math.sqrt(attn.shape[1]))
286
+ cross_map = attn.reshape(batch_size, -1, res, res, attn.shape[-1])
287
+ attention_groups[str(res)].append(cross_map)
288
+
289
+ for k, cross_map in attention_groups.items():
290
+ cross_map = torch.cat(cross_map, dim=-4) # concat heads
291
+ cross_map = cross_map.sum(-4) / cross_map.shape[-4] # e.g., 64 torch.Size([2, 64, 64, 77])
292
+ cross_map = torch.stack([batch_map[..., batch_pos] for batch_pos, batch_map in zip(new_token_pos, cross_map)]) # torch.Size([2, 64, 64, 2])
293
+ attention_groups[k] = cross_map
294
+
295
+ attn_reg_total = 0
296
+ # step3: calculate loss for each resolution: <new1> <new2> -> <new1> is to penalize outside mask, <new2> to align with mask
297
+ for k, cross_map in attention_groups.items():
298
+ map_adjective, map_subject = cross_map[..., 0], cross_map[..., 1]
299
+
300
+ map_subject = map_subject / map_subject.max()
301
+ map_adjective = map_adjective / map_adjective.max()
302
+
303
+ gt_mask = F.interpolate(masks, size=map_subject.shape[1:], mode='nearest').squeeze(1)
304
+
305
+ if self.reg_full_identity:
306
+ loss_subject = F.mse_loss(map_subject.float(), gt_mask.float(), reduction='mean')
307
+ else:
308
+ loss_subject = map_subject[gt_mask == 0].mean()
309
+
310
+ loss_adjective = map_adjective[gt_mask == 0].mean()
311
+
312
+ attn_reg_total += self.attn_reg_weight * (loss_subject + loss_adjective)
313
+ return attn_reg_total
314
+
315
+ def load_delta_state_dict(self, delta_state_dict):
316
+ # load embedding
317
+ logger = get_logger('mixofshow', log_level='INFO')
318
+
319
+ if 'new_concept_embedding' in delta_state_dict and len(delta_state_dict['new_concept_embedding']) != 0:
320
+ new_concept_tokens = list(delta_state_dict['new_concept_embedding'].keys())
321
+
322
+ # check whether new concept is initialized
323
+ token_embeds = self.text_encoder.get_input_embeddings().weight.data
324
+ if set(new_concept_tokens) != set(self.new_concept_cfg.keys()):
325
+ logger.warning('Your checkpoint has different concepts from your model; loading existing concepts')
326
+
327
+ for concept_name, concept_cfg in self.new_concept_cfg.items():
328
+ logger.info(f'load: concept_{concept_name}')
329
+ token_embeds[concept_cfg['concept_token_ids']] = token_embeds[
330
+ concept_cfg['concept_token_ids']].copy_(delta_state_dict['new_concept_embedding'][concept_name])
331
+
332
+ # load text_encoder
333
+ if 'text_encoder' in delta_state_dict and len(delta_state_dict['text_encoder']) != 0:
334
+ load_keys = delta_state_dict['text_encoder'].keys()
335
+ if hasattr(self, 'text_encoder_lora') and len(load_keys) == 2 * len(self.text_encoder_lora):
336
+ logger.info('loading LoRA for text encoder:')
337
+ for lora_module in self.text_encoder_lora:
338
+ for name, param, in lora_module.named_parameters():
339
+ logger.info(f'load: {lora_module.name}.{name}')
340
+ param.data.copy_(delta_state_dict['text_encoder'][f'{lora_module.name}.{name}'])
341
+ else:
342
+ for name, param, in self.text_encoder.named_parameters():
343
+ if name in load_keys and 'token_embedding' not in name:
344
+ logger.info(f'load: {name}')
345
+ param.data.copy_(delta_state_dict['text_encoder'][f'{name}'])
346
+
347
+ # load unet
348
+ if 'unet' in delta_state_dict and len(delta_state_dict['unet']) != 0:
349
+ load_keys = delta_state_dict['unet'].keys()
350
+ if hasattr(self, 'unet_lora') and len(load_keys) == 2 * len(self.unet_lora):
351
+ logger.info('loading LoRA for unet:')
352
+ for lora_module in self.unet_lora:
353
+ for name, param, in lora_module.named_parameters():
354
+ logger.info(f'load: {lora_module.name}.{name}')
355
+ param.data.copy_(delta_state_dict['unet'][f'{lora_module.name}.{name}'])
356
+ else:
357
+ for name, param, in self.unet.named_parameters():
358
+ if name in load_keys:
359
+ logger.info(f'load: {name}')
360
+ param.data.copy_(delta_state_dict['unet'][f'{name}'])
361
+
362
+ def delta_state_dict(self):
363
+ delta_dict = {'new_concept_embedding': {}, 'text_encoder': {}, 'unet': {}}
364
+
365
+ # save_embedding
366
+ for concept_name, concept_cfg in self.new_concept_cfg.items():
367
+ learned_embeds = self.text_encoder.get_input_embeddings().weight[concept_cfg['concept_token_ids']]
368
+ delta_dict['new_concept_embedding'][concept_name] = learned_embeds.detach().cpu()
369
+
370
+ # save text model
371
+ for lora_module in self.text_encoder_lora:
372
+ for name, param, in lora_module.named_parameters():
373
+ delta_dict['text_encoder'][f'{lora_module.name}.{name}'] = param.cpu().clone()
374
+
375
+ # save unet model
376
+ for lora_module in self.unet_lora:
377
+ for name, param, in lora_module.named_parameters():
378
+ delta_dict['unet'][f'{lora_module.name}.{name}'] = param.cpu().clone()
379
+
380
+ return delta_dict
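
Usage note: a minimal training-step sketch for `EDLoRATrainer`. The base model path, the learning rates, the `lora_cfg` keyword names (`rank`, `alpha`), and the concept/initializer tokens are illustrative assumptions; a real run feeds batches from the LoRA dataset and wraps this in an accelerate loop.

    import torch
    from mixofshow.pipelines.trainer_edlora import EDLoRATrainer

    # keys mirror what set_finetune_cfg reads; lr values and LoRA kwargs are placeholders
    finetune_cfg = {
        'text_embedding': {'enable_tuning': True, 'lr': 1e-3},
        'text_encoder': {'enable_tuning': True, 'lr': 1e-5,
                         'lora_cfg': {'where': 'CLIPAttention', 'rank': 4, 'alpha': 1.0}},
        'unet': {'enable_tuning': True, 'lr': 1e-4,
                 'lora_cfg': {'where': 'Attention', 'rank': 4, 'alpha': 1.0}},
    }

    trainer = EDLoRATrainer(
        pretrained_path='runwayml/stable-diffusion-v1-5',   # assumed base model
        new_concept_token='<TOK1>+<TOK2>',
        initializer_token='man+person',
        enable_edlora=True,
        finetune_cfg=finetune_cfg).to('cuda')
    optimizer = torch.optim.AdamW(trainer.get_params_to_optimize())

    # one illustrative step on dummy tensors (images in [-1, 1], masks at latent resolution)
    images = torch.randn(1, 3, 512, 512, device='cuda')
    masks = torch.ones(1, 1, 64, 64, device='cuda')
    loss = trainer(images, ['a photo of <TOK1> <TOK2> in the garden'], masks, masks)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

After training, `trainer.delta_state_dict()` returns the embedding/LoRA deltas that the conversion utility below merges back into a diffusers pipeline.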
mixofshow/utils/__init__.py ADDED
File without changes
mixofshow/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (151 Bytes). View file
 
mixofshow/utils/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (149 Bytes). View file
 
mixofshow/utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (149 Bytes). View file
 
mixofshow/utils/__pycache__/convert_edlora_to_diffusers.cpython-38.pyc ADDED
Binary file (3.64 kB). View file
 
mixofshow/utils/__pycache__/convert_edlora_to_diffusers.cpython-39.pyc ADDED
Binary file (3.58 kB). View file
 
mixofshow/utils/__pycache__/ptp_util.cpython-38.pyc ADDED
Binary file (7.21 kB). View file
 
mixofshow/utils/__pycache__/ptp_util.cpython-39.pyc ADDED
Binary file (7.2 kB). View file
 
mixofshow/utils/__pycache__/registry.cpython-38.pyc ADDED
Binary file (2.49 kB). View file
 
mixofshow/utils/__pycache__/registry.cpython-39.pyc ADDED
Binary file (2.48 kB). View file
 
mixofshow/utils/__pycache__/util.cpython-310.pyc ADDED
Binary file (9.59 kB). View file
 
mixofshow/utils/__pycache__/util.cpython-38.pyc ADDED
Binary file (9.51 kB). View file
 
mixofshow/utils/__pycache__/util.cpython-39.pyc ADDED
Binary file (9.51 kB). View file
 
mixofshow/utils/arial.ttf ADDED
Binary file (367 kB). View file
 
mixofshow/utils/convert_edlora_to_diffusers.py ADDED
@@ -0,0 +1,99 @@
1
+ import copy
2
+
3
+
4
+ def load_new_concept(pipe, new_concept_embedding, enable_edlora=True):
5
+ new_concept_cfg = {}
6
+
7
+ for idx, (concept_name, concept_embedding) in enumerate(new_concept_embedding.items()):
8
+ if enable_edlora:
9
+ num_new_embedding = 16
10
+ else:
11
+ num_new_embedding = 1
12
+ new_token_names = [f'<new{idx * num_new_embedding + layer_id}>' for layer_id in range(num_new_embedding)]
13
+ num_added_tokens = pipe.tokenizer.add_tokens(new_token_names)
14
+ assert num_added_tokens == len(new_token_names), 'some token is already in tokenizer'
15
+ new_token_ids = [pipe.tokenizer.convert_tokens_to_ids(token_name) for token_name in new_token_names]
16
+
17
+ # init embedding
18
+ pipe.text_encoder.resize_token_embeddings(len(pipe.tokenizer))
19
+ token_embeds = pipe.text_encoder.get_input_embeddings().weight.data
20
+ token_embeds[new_token_ids] = concept_embedding.clone().to(token_embeds.device, dtype=token_embeds.dtype)
21
+ print(f'load embedding: {concept_name}')
22
+
23
+ new_concept_cfg.update({
24
+ concept_name: {
25
+ 'concept_token_ids': new_token_ids,
26
+ 'concept_token_names': new_token_names
27
+ }
28
+ })
29
+
30
+ return pipe, new_concept_cfg
31
+
32
+
33
+ def merge_lora_into_weight(original_state_dict, lora_state_dict, model_type, alpha):
34
+ def get_lora_down_name(original_layer_name):
35
+ if model_type == 'text_encoder':
36
+ lora_down_name = original_layer_name.replace('q_proj.weight', 'q_proj.lora_down.weight') \
37
+ .replace('k_proj.weight', 'k_proj.lora_down.weight') \
38
+ .replace('v_proj.weight', 'v_proj.lora_down.weight') \
39
+ .replace('out_proj.weight', 'out_proj.lora_down.weight') \
40
+ .replace('fc1.weight', 'fc1.lora_down.weight') \
41
+ .replace('fc2.weight', 'fc2.lora_down.weight')
42
+ else:
43
+ lora_down_name = original_layer_name.replace('to_q.weight', 'to_q.lora_down.weight') \
44
+ .replace('to_k.weight', 'to_k.lora_down.weight') \
45
+ .replace('to_v.weight', 'to_v.lora_down.weight') \
46
+ .replace('to_out.0.weight', 'to_out.0.lora_down.weight') \
47
+ .replace('ff.net.0.proj.weight', 'ff.net.0.proj.lora_down.weight') \
48
+ .replace('ff.net.2.weight', 'ff.net.2.lora_down.weight') \
49
+ .replace('proj_out.weight', 'proj_out.lora_down.weight') \
50
+ .replace('proj_in.weight', 'proj_in.lora_down.weight')
51
+
52
+ return lora_down_name
53
+
54
+ assert model_type in ['unet', 'text_encoder']
55
+ new_state_dict = copy.deepcopy(original_state_dict)
56
+
57
+ load_cnt = 0
58
+ for k in new_state_dict.keys():
59
+ lora_down_name = get_lora_down_name(k)
60
+ lora_up_name = lora_down_name.replace('lora_down', 'lora_up')
61
+
62
+ if lora_up_name in lora_state_dict:
63
+ load_cnt += 1
64
+ original_params = new_state_dict[k]
65
+ lora_down_params = lora_state_dict[lora_down_name].to(original_params.device)
66
+ lora_up_params = lora_state_dict[lora_up_name].to(original_params.device)
67
+ if len(original_params.shape) == 4:
68
+ lora_param = lora_up_params.squeeze() @ lora_down_params.squeeze()
69
+ lora_param = lora_param.unsqueeze(-1).unsqueeze(-1)
70
+ else:
71
+ lora_param = lora_up_params @ lora_down_params
72
+ merge_params = original_params + alpha * lora_param
73
+ new_state_dict[k] = merge_params
74
+
75
+ print(f'load {load_cnt} LoRAs of {model_type}')
76
+ return new_state_dict
77
+
78
+
79
+ def convert_edlora(pipe, state_dict, enable_edlora, alpha=0.6):
80
+
81
+ state_dict = state_dict['params'] if 'params' in state_dict.keys() else state_dict
82
+
83
+ # step 1: load embedding
84
+ if 'new_concept_embedding' in state_dict and len(state_dict['new_concept_embedding']) != 0:
85
+ pipe, new_concept_cfg = load_new_concept(pipe, state_dict['new_concept_embedding'], enable_edlora)
86
+
87
+ # step 2: merge lora weight to unet
88
+ unet_lora_state_dict = state_dict['unet']
89
+ pretrained_unet_state_dict = pipe.unet.state_dict()
90
+ updated_unet_state_dict = merge_lora_into_weight(pretrained_unet_state_dict, unet_lora_state_dict, model_type='unet', alpha=alpha)
91
+ pipe.unet.load_state_dict(updated_unet_state_dict)
92
+
93
+ # step 3: merge lora weight to text_encoder
94
+ text_encoder_lora_state_dict = state_dict['text_encoder']
95
+ pretrained_text_encoder_state_dict = pipe.text_encoder.state_dict()
96
+ updated_text_encoder_state_dict = merge_lora_into_weight(pretrained_text_encoder_state_dict, text_encoder_lora_state_dict, model_type='text_encoder', alpha=alpha)
97
+ pipe.text_encoder.load_state_dict(updated_text_encoder_state_dict)
98
+
99
+ return pipe, new_concept_cfg
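
Usage note: a short sketch of merging a saved ED-LoRA delta back into a plain diffusers pipeline with `convert_edlora`; the base-model id and the checkpoint path are assumptions for illustration.

    import torch
    from diffusers import DiffusionPipeline
    from mixofshow.utils.convert_edlora_to_diffusers import convert_edlora

    pipe = DiffusionPipeline.from_pretrained(
        'runwayml/stable-diffusion-v1-5', safety_checker=None)   # assumed base model
    delta = torch.load('experiments/EDLoRA_example/models/edlora_model-latest.pth',  # hypothetical delta checkpoint
                       map_location='cpu')
    pipe, new_concept_cfg = convert_edlora(pipe, delta, enable_edlora=True, alpha=0.7)
    pipe = pipe.to('cuda')
    # new_concept_cfg maps each concept to its 16 layer-wise tokens; it is what
    # set_new_concept_cfg expects on the EDLoRA / regionally controlled pipelines.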
mixofshow/utils/ptp_util.py ADDED
@@ -0,0 +1,200 @@
1
+ import abc
2
+ from typing import List, Tuple
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+ from IPython.display import display
8
+ from PIL import Image
9
+
10
+
11
+ class EmptyControl:
12
+ def step_callback(self, x_t):
13
+ return x_t
14
+
15
+ def between_steps(self):
16
+ return
17
+
18
+ def __call__(self, attn, is_cross: bool, place_in_unet: str):
19
+ return attn
20
+
21
+
22
+ class AttentionControl(abc.ABC):
23
+ def step_callback(self, x_t):
24
+ return x_t
25
+
26
+ def between_steps(self):
27
+ return
28
+
29
+ @property
30
+ def num_uncond_att_layers(self):
31
+ return self.num_att_layers if self.low_resource else 0
32
+
33
+ @abc.abstractmethod
34
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
35
+ raise NotImplementedError
36
+
37
+ def __call__(self, attn, is_cross: bool, place_in_unet: str):
38
+ if self.cur_att_layer >= self.num_uncond_att_layers:
39
+ if self.low_resource:
40
+ attn = self.forward(attn, is_cross, place_in_unet)
41
+ else:
42
+ if self.training:
43
+ attn = self.forward(attn, is_cross, place_in_unet)
44
+ else:
45
+ h = attn.shape[0]
46
+ attn[h // 2:] = self.forward(attn[h // 2:], is_cross, place_in_unet)
47
+
48
+ self.cur_att_layer += 1
49
+ if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
50
+ self.cur_att_layer = 0
51
+ self.cur_step += 1
52
+ self.between_steps()
53
+ return attn
54
+
55
+ def reset(self):
56
+ self.cur_step = 0
57
+ self.cur_att_layer = 0
58
+
59
+ def __init__(self, low_resource, training):
60
+ self.cur_step = 0
61
+ self.num_att_layers = -1
62
+ self.cur_att_layer = 0
63
+ self.low_resource = low_resource
64
+ self.training = training
65
+
66
+
67
+ class AttentionStore(AttentionControl):
68
+ @staticmethod
69
+ def get_empty_store():
70
+ return {
71
+ 'down_cross': [],
72
+ 'mid_cross': [],
73
+ 'up_cross': [],
74
+ 'down_self': [],
75
+ 'mid_self': [],
76
+ 'up_self': []
77
+ }
78
+
79
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
80
+ key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
81
+ self.step_store[key].append(attn)
82
+ return attn
83
+
84
+ def between_steps(self):
85
+ if len(self.attention_store) == 0:
86
+ self.attention_store = self.step_store
87
+ else:
88
+ for key in self.attention_store:
89
+ for i in range(len(self.attention_store[key])):
90
+ self.attention_store[key][i] = self.attention_store[key][i] + self.step_store[key][i]
91
+ self.step_store = self.get_empty_store()
92
+
93
+ def get_average_attention(self):
94
+ average_attention = {
95
+ key: [item / self.cur_step for item in self.attention_store[key]]
96
+ for key in self.attention_store
97
+ }
98
+ return average_attention
99
+
100
+ def reset(self):
101
+ super(AttentionStore, self).reset()
102
+ self.step_store = self.get_empty_store()
103
+ self.attention_store = {}
104
+
105
+ def __init__(self, low_resource=False, training=False):
106
+ super(AttentionStore, self).__init__(low_resource, training)
107
+ self.step_store = self.get_empty_store()
108
+ self.attention_store = {}
109
+
110
+
111
+ def text_under_image(image: np.ndarray,
112
+ text: str,
113
+ text_color: Tuple[int, int, int] = (0, 0, 0)):
114
+ h, w, c = image.shape
115
+ offset = int(h * .2)
116
+ img = np.ones((h + offset, w, c), dtype=np.uint8) * 255
117
+ font = cv2.FONT_HERSHEY_SIMPLEX
118
+ # font = ImageFont.truetype("/usr/share/fonts/truetype/noto/NotoMono-Regular.ttf", font_size)
119
+ img[:h] = image
120
+ textsize = cv2.getTextSize(text, font, 1, 2)[0]
121
+ text_x, text_y = (w - textsize[0]) // 2, h + offset - textsize[1] // 2
122
+ cv2.putText(img, text, (text_x, text_y), font, 1, text_color, 2)
123
+ return img
124
+
125
+
126
+ def view_images(images, num_rows=1, offset_ratio=0.02, notebook=True):
127
+ if type(images) is list:
128
+ num_empty = len(images) % num_rows
129
+ elif images.ndim == 4:
130
+ num_empty = images.shape[0] % num_rows
131
+ else:
132
+ images = [images]
133
+ num_empty = 0
134
+
135
+ empty_images = np.ones(images[0].shape, dtype=np.uint8) * 255
136
+ images = [image.astype(np.uint8)
137
+ for image in images] + [empty_images] * num_empty
138
+ num_items = len(images)
139
+
140
+ h, w, c = images[0].shape
141
+ offset = int(h * offset_ratio)
142
+ num_cols = num_items // num_rows
143
+ image_ = np.ones(
144
+ (h * num_rows + offset * (num_rows - 1), w * num_cols + offset *
145
+ (num_cols - 1), 3),
146
+ dtype=np.uint8) * 255
147
+ for i in range(num_rows):
148
+ for j in range(num_cols):
149
+ image_[i * (h + offset):i * (h + offset) + h:, j * (w + offset):j *
150
+ (w + offset) + w] = images[i * num_cols + j]
151
+
152
+ pil_img = Image.fromarray(image_)
153
+ if notebook is True:
154
+ display(pil_img)
155
+ else:
156
+ return pil_img
157
+
158
+
159
+ def aggregate_attention(attention_store: AttentionStore, res: int,
160
+ from_where: List[str], prompts: List[str],
161
+ is_cross: bool, select: int):
162
+ out = []
163
+ attention_maps = attention_store.get_average_attention()
164
+ num_pixels = res**2
165
+ for location in from_where:
166
+ for item in attention_maps[
167
+ f"{location}_{'cross' if is_cross else 'self'}"]:
168
+ if item.shape[1] == num_pixels:
169
+ cross_maps = item.reshape(len(prompts), -1, res, res, item.shape[-1])[select]
170
+ out.append(cross_maps)
171
+ out = torch.cat(out, dim=0)
172
+ out = out.sum(0) / out.shape[0]
173
+ return out.cpu()
174
+
175
+
176
+ def show_cross_attention(attention_store: AttentionStore,
177
+ res: int,
178
+ from_where: List[str],
179
+ prompts: List[str],
180
+ tokenizer,
181
+ select: int = 0,
182
+ notebook=True):
183
+ tokens = tokenizer.encode(prompts[select])
184
+ decoder = tokenizer.decode
185
+ attention_maps = aggregate_attention(attention_store, res, from_where, prompts, True, select)
186
+
187
+ images = []
188
+ for i in range(len(tokens)):
189
+ image = attention_maps[:, :, i]
190
+ image = 255 * image / image.max()
191
+ image = image.unsqueeze(-1).expand(*image.shape, 3)
192
+ image = image.numpy().astype(np.uint8)
193
+ image = np.array(Image.fromarray(image).resize((256, 256)))
194
+ image = text_under_image(image, decoder(int(tokens[i])))
195
+ images.append(image)
196
+
197
+ if notebook is True:
198
+ view_images(np.stack(images, axis=0))
199
+ else:
200
+ return view_images(np.stack(images, axis=0), notebook=False)
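
Usage note: a toy driver showing how `AttentionStore` accumulates per-layer maps across steps and averages them. In the trainer above it is wired into the UNet via `revise_edlora_unet_attention_controller_forward`, which is expected to set `num_att_layers`; it is set manually here only for the standalone illustration, and the map shapes are arbitrary.

    import torch
    from mixofshow.utils.ptp_util import AttentionStore

    store = AttentionStore(training=True)
    store.num_att_layers = 2   # normally set by the hook that patches the UNet attention layers

    # pretend two cross-attention layers fire on each of two denoising steps
    for _ in range(2):
        store(torch.rand(8, 256, 77), is_cross=True, place_in_unet='down')  # 16x16 map, 8 heads
        store(torch.rand(8, 64, 77), is_cross=True, place_in_unet='mid')    # 8x8 map
    avg = store.get_average_attention()
    print([m.shape for m in avg['down_cross']])  # [torch.Size([8, 256, 77])]
    store.reset()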
mixofshow/utils/registry.py ADDED
@@ -0,0 +1,79 @@
1
+ # Modified from: https://github.com/facebookresearch/fvcore/blob/master/fvcore/common/registry.py # noqa: E501
2
+
3
+
4
+ class Registry():
5
+ """
6
+ The registry that provides name -> object mapping, to support third-party
7
+ users' custom modules.
8
+
9
+ To create a registry (e.g. a backbone registry):
10
+
11
+ .. code-block:: python
12
+
13
+ BACKBONE_REGISTRY = Registry('BACKBONE')
14
+
15
+ To register an object:
16
+
17
+ .. code-block:: python
18
+
19
+ @BACKBONE_REGISTRY.register()
20
+ class MyBackbone():
21
+ ...
22
+
23
+ Or:
24
+
25
+ .. code-block:: python
26
+
27
+ BACKBONE_REGISTRY.register(MyBackbone)
28
+ """
29
+ def __init__(self, name):
30
+ """
31
+ Args:
32
+ name (str): the name of this registry
33
+ """
34
+ self._name = name
35
+ self._obj_map = {}
36
+
37
+ def _do_register(self, name, obj):
38
+ assert (name not in self._obj_map), (
39
+ f"An object named '{name}' was already registered "
40
+ f"in '{self._name}' registry!")
41
+ self._obj_map[name] = obj
42
+
43
+ def register(self, obj=None):
44
+ """
45
+ Register the given object under the name `obj.__name__`.
46
+ Can be used as either a decorator or not.
47
+ See docstring of this class for usage.
48
+ """
49
+ if obj is None:
50
+ # used as a decorator
51
+ def deco(func_or_class):
52
+ name = func_or_class.__name__
53
+ self._do_register(name, func_or_class)
54
+ return func_or_class
55
+
56
+ return deco
57
+
58
+ # used as a function call
59
+ name = obj.__name__
60
+ self._do_register(name, obj)
61
+
62
+ def get(self, name):
63
+ ret = self._obj_map.get(name)
64
+ if ret is None:
65
+ raise KeyError(
66
+ f"No object named '{name}' found in '{self._name}' registry!")
67
+ return ret
68
+
69
+ def __contains__(self, name):
70
+ return name in self._obj_map
71
+
72
+ def __iter__(self):
73
+ return iter(self._obj_map.items())
74
+
75
+ def keys(self):
76
+ return self._obj_map.keys()
77
+
78
+
79
+ TRANSFORM_REGISTRY = Registry('transform')
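
Usage note: `TRANSFORM_REGISTRY` is presumably what the data pipeline uses to look up transforms by name from the config. A minimal sketch of registering and retrieving one; the transform class itself is a placeholder, not part of this commit.

    from mixofshow.utils.registry import TRANSFORM_REGISTRY

    @TRANSFORM_REGISTRY.register()
    class ResizeWithMask:
        def __init__(self, size=512):
            self.size = size

        def __call__(self, sample):
            return sample   # placeholder: a real transform would resize image and mask here

    transform_cls = TRANSFORM_REGISTRY.get('ResizeWithMask')
    transform = transform_cls(size=768)
    assert 'ResizeWithMask' in TRANSFORM_REGISTRY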
mixofshow/utils/util.py ADDED
@@ -0,0 +1,313 @@
+ import datetime
+ import logging
+ import os
+ import os.path
+ import os.path as osp
+ import time
+ from collections import OrderedDict
+
+ import PIL
+ import torch
+ from accelerate.logging import get_logger
+ from accelerate.state import PartialState
+ from PIL import Image, ImageDraw, ImageFont
+ from torchvision.transforms.transforms import ToTensor
+ from torchvision.utils import make_grid
+
+ NEGATIVE_PROMPT = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+
+
+ # ----------- file/logger util ----------
+ def get_time_str():
+     return time.strftime('%Y%m%d_%H%M%S', time.localtime())
+
+
+ def mkdir_and_rename(path):
+     """mkdirs. If path exists, rename it with timestamp and create a new one.
+
+     Args:
+         path (str): Folder path.
+     """
+     if osp.exists(path):
+         new_name = path + '_archived_' + get_time_str()
+         print(f'Path already exists. Rename it to {new_name}', flush=True)
+         os.rename(path, new_name)
+     os.makedirs(path, exist_ok=True)
+
+
+ def make_exp_dirs(opt):
+     """Make dirs for experiments."""
+     path_opt = opt['path'].copy()
+     if opt['is_train']:
+         mkdir_and_rename(path_opt.pop('experiments_root'))
+     else:
+         mkdir_and_rename(path_opt.pop('results_root'))
+     for key, path in path_opt.items():
+         if ('strict_load' in key) or ('pretrain_network' in key) or (
+                 'resume' in key) or ('param_key' in key) or ('lora_path' in key):
+             continue
+         else:
+             os.makedirs(path, exist_ok=True)
+
+
+ def copy_opt_file(opt_file, experiments_root):
+     # copy the yml file to the experiment root
+     import sys
+     import time
+     from shutil import copyfile
+     cmd = ' '.join(sys.argv)
+     filename = osp.join(experiments_root, osp.basename(opt_file))
+     copyfile(opt_file, filename)
+
+     with open(filename, 'r+') as f:
+         lines = f.readlines()
+         lines.insert(
+             0, f'# GENERATE TIME: {time.asctime()}\n# CMD:\n# {cmd}\n\n')
+         f.seek(0)
+         f.writelines(lines)
+
+
+ def set_path_logger(accelerator, root_path, config_path, opt, is_train=True):
+     opt['is_train'] = is_train
+
+     if is_train:
+         experiments_root = osp.join(root_path, 'experiments', opt['name'])
+         opt['path']['experiments_root'] = experiments_root
+         opt['path']['models'] = osp.join(experiments_root, 'models')
+         opt['path']['log'] = experiments_root
+         opt['path']['visualization'] = osp.join(experiments_root,
+                                                 'visualization')
+     else:
+         results_root = osp.join(root_path, 'results', opt['name'])
+         opt['path']['results_root'] = results_root
+         opt['path']['log'] = results_root
+         opt['path']['visualization'] = osp.join(results_root, 'visualization')
+
+     # Handle the output folder creation
+     if accelerator.is_main_process:
+         make_exp_dirs(opt)
+
+     accelerator.wait_for_everyone()
+
+     if is_train:
+         copy_opt_file(config_path, opt['path']['experiments_root'])
+         log_file = osp.join(opt['path']['log'],
+                             f"train_{opt['name']}_{get_time_str()}.log")
+         set_logger(log_file)
+     else:
+         copy_opt_file(config_path, opt['path']['results_root'])
+         log_file = osp.join(opt['path']['log'],
+                             f"test_{opt['name']}_{get_time_str()}.log")
+         set_logger(log_file)
+
+
+ def set_logger(log_file=None):
+     # Make one log on every process with the configuration for debugging.
+     format_str = '%(asctime)s %(levelname)s: %(message)s'
+     log_level = logging.INFO
+     handlers = []
+
+     file_handler = logging.FileHandler(log_file, 'w')
+     file_handler.setFormatter(logging.Formatter(format_str))
+     file_handler.setLevel(log_level)
+     handlers.append(file_handler)
+
+     stream_handler = logging.StreamHandler()
+     stream_handler.setFormatter(logging.Formatter(format_str))
+     handlers.append(stream_handler)
+
+     logging.basicConfig(handlers=handlers, level=log_level)
+
+
+ def dict2str(opt, indent_level=1):
+     """dict to string for printing options.
+
+     Args:
+         opt (dict): Option dict.
+         indent_level (int): Indent level. Default: 1.
+
+     Return:
+         (str): Option string for printing.
+     """
+     msg = '\n'
+     for k, v in opt.items():
+         if isinstance(v, dict):
+             msg += ' ' * (indent_level * 2) + k + ':['
+             msg += dict2str(v, indent_level + 1)
+             msg += ' ' * (indent_level * 2) + ']\n'
+         else:
+             msg += ' ' * (indent_level * 2) + k + ': ' + str(v) + '\n'
+     return msg
+
+
+ class MessageLogger():
+     """Message logger for printing.
+
+     Args:
+         opt (dict): Config. It contains the following keys:
+             name (str): Exp name.
+             logger (dict): Contains 'print_freq' (int) for the logging interval.
+             train (dict): Contains 'total_iter' (int) for total iters.
+         start_iter (int): Start iter. Default: 1.
+     """
+     def __init__(self, opt, start_iter=1):
+         self.exp_name = opt['name']
+         self.interval = opt['logger']['print_freq']
+         self.start_iter = start_iter
+         self.max_iters = opt['train']['total_iter']
+         self.start_time = time.time()
+         self.logger = get_logger('mixofshow', log_level='INFO')
+
+     def reset_start_time(self):
+         self.start_time = time.time()
+
+     def __call__(self, log_vars):
+         """Format and log a training message.
+
+         Args:
+             log_vars (dict): It contains the following keys:
+                 iter (int): Current iter.
+                 lrs (list): List of learning rates.
+
+                 Any remaining items (e.g. losses) are appended to the message.
+         """
+         # iter and learning rates
+         current_iter = log_vars.pop('iter')
+         lrs = log_vars.pop('lrs')
+
+         message = (
+             f'[{self.exp_name[:5]}..][Iter:{current_iter:8,d}, lr:('
+         )
+         for v in lrs:
+             message += f'{v:.3e},'
+         message += ')] '
+
+         # time and estimated time
+         total_time = time.time() - self.start_time
+         time_sec_avg = total_time / (current_iter - self.start_iter + 1)
+         eta_sec = time_sec_avg * (self.max_iters - current_iter - 1)
+         eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
+         message += f'[eta: {eta_str}] '
+
+         # other items, especially losses
+         for k, v in log_vars.items():
+             message += f'{k}: {v:.4e} '
+
+         self.logger.info(message)
+
+
+ def reduce_loss_dict(accelerator, loss_dict):
+     """Reduce loss dict.
+
+     In distributed training, it averages the losses among different GPUs.
+
+     Args:
+         loss_dict (OrderedDict): Loss dict.
+     """
+     with torch.no_grad():
+         keys = []
+         losses = []
+         for name, value in loss_dict.items():
+             keys.append(name)
+             losses.append(value)
+         losses = torch.stack(losses, 0)
+         losses = accelerator.reduce(losses)
+
+         world_size = PartialState().num_processes
+         losses /= world_size
+
+         loss_dict = {key: loss for key, loss in zip(keys, losses)}
+
+         log_dict = OrderedDict()
+         for name, value in loss_dict.items():
+             log_dict[name] = value.mean().item()
+
+         return log_dict
+
+
+ def pil_imwrite(img, file_path, auto_mkdir=True):
+     """Write a PIL image to file.
+
+     Args:
+         img (PIL.Image.Image): Image to be written.
+         file_path (str): Image file path.
+         auto_mkdir (bool): If the parent folder of `file_path` does not exist,
+             whether to create it automatically.
+     """
+     assert isinstance(
+         img, PIL.Image.Image), 'model should return a list of PIL images'
+     if auto_mkdir:
+         dir_name = os.path.abspath(os.path.dirname(file_path))
+         os.makedirs(dir_name, exist_ok=True)
+     img.save(file_path)
+
+
+ def draw_prompt(text, height, width, font_size=45):
+     img = Image.new('RGB', (width, height), (255, 255, 255))
+     draw = ImageDraw.Draw(img)
+     font = ImageFont.truetype(
+         osp.join(osp.dirname(osp.abspath(__file__)), 'arial.ttf'), font_size)
+
+     # find how many characters fit on one line, keeping a ~10% margin on each side
+     guess_count = 0
+     while (font.font.getsize(text[:guess_count])[0][0] + 0.1 * width < width - 0.1 * width
+            and guess_count < len(text)):
+         guess_count += 1
+
+     # wrap the prompt every `guess_count` characters
+     text_new = ''
+     for idx, s in enumerate(text):
+         if idx % guess_count == 0:
+             text_new += '\n'
+             if s == ' ':
+                 s = ''  # drop the leading space after a line break
+         text_new += s
+
+     draw.text([int(0.1 * width), int(0.3 * height)],
+               text_new,
+               font=font,
+               fill='black')
+     return img
+
+
+ def compose_visualize(dir_path):
+     file_list = sorted(os.listdir(dir_path))
+     img_list = []
+     info_dict = {'prompts': set(), 'sample_args': set(), 'suffix': set()}
+     for filename in file_list:
+         prompt, sample_args, index, suffix = osp.splitext(
+             osp.basename(filename))[0].split('---')
+
+         filepath = osp.join(dir_path, filename)
+         img = ToTensor()(Image.open(filepath))
+         height, width = img.shape[1:]
+
+         if prompt not in info_dict['prompts']:
+             img_list.append(ToTensor()(draw_prompt(prompt,
+                                                    height=height,
+                                                    width=width,
+                                                    font_size=45)))
+         info_dict['prompts'].add(prompt)
+         info_dict['sample_args'].add(sample_args)
+         info_dict['suffix'].add(suffix)
+
+         img_list.append(img)
+     assert len(
+         info_dict['sample_args']
+     ) == 1, 'compose dir should contain images from the same sample args.'
+     assert len(info_dict['suffix']
+                ) == 1, 'compose dir should contain images from the same suffix.'
+
+     grid = make_grid(img_list, nrow=len(img_list) // len(info_dict['prompts']))
+     # Add 0.5 after unnormalizing to [0, 255] to round to nearest integer
+     ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to(
+         'cpu', torch.uint8).numpy()
+     im = Image.fromarray(ndarr)
+     save_name = f"{info_dict['sample_args'].pop()}---{info_dict['suffix'].pop()}.jpg"
+     im.save(osp.join(osp.dirname(dir_path), save_name))
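Taken together, a minimal sketch of how the logging helpers above could be wired into a training loop. The opt layout follows the MessageLogger docstring; the Accelerator setup, log file path, loss name, and learning rate are illustrative placeholders, not values from this repo:

import torch
from accelerate import Accelerator
from mixofshow.utils.util import MessageLogger, reduce_loss_dict, set_logger

accelerator = Accelerator()
set_logger('demo_train.log')   # hypothetical log file path
opt = {'name': 'demo_exp', 'logger': {'print_freq': 10}, 'train': {'total_iter': 1000}}
msg_logger = MessageLogger(opt, start_iter=1)

for it in range(1, opt['train']['total_iter'] + 1):
    # placeholder loss; in real training this comes from the diffusion objective
    loss_dict = {'loss': torch.tensor(0.1, device=accelerator.device)}
    if it % opt['logger']['print_freq'] == 0:
        log_vars = {'iter': it, 'lrs': [1e-4]}
        log_vars.update(reduce_loss_dict(accelerator, loss_dict))  # averaged over processes
        msg_logger(log_vars)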
orthogonal_mats/1280.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8cacbfe6dda3140404a86019e487349f7b693667cd7efe5848fe9fc04b1a3618
+ size 13107328
orthogonal_mats/320.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99b8a3b3cf101ac83f43eda74392447207bdc4745e8e77626219d994ea8f2ae9
+ size 819328
orthogonal_mats/640.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0c3c38d378cf4c22f7ab4cd3fb734c846afd4500ca58f253cb63c44304c360aa
+ size 3276928
orthogonal_mats/768.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64bd95b552140a47665e46cb5736cd8562e40b5f27f0f262a4b7563d13061daf
+ size 4718720
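The four .npy files above are Git LFS pointers; judging from the pointer sizes (N*N*8 bytes plus a 128-byte numpy header), each appears to hold a float64 N×N matrix for N in {320, 640, 768, 1280}, matching common Stable Diffusion channel widths. A quick sanity check after `git lfs pull` — the float64 square layout and the orthogonality check are inferences from the file sizes and names, not documented here:

import numpy as np

for dim in (320, 640, 768, 1280):
    q = np.load(f'orthogonal_mats/{dim}.npy')
    assert q.shape == (dim, dim), q.shape
    # a matrix Q is orthogonal iff Q^T Q is (numerically) the identity
    print(dim, np.allclose(q.T @ q, np.eye(dim), atol=1e-6))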