Commit 0792c6b
csyxwei committed
1 Parent(s): e942d32

elite code init

.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ _debug*
2
+ .env
3
+ __pycache__
4
+ _sc.py
5
+ *.ckpt
6
+ *.bin
7
+
8
+ checkpoints
9
+ .idea
10
+ .idea/workspace.xml
11
+ .DS_Store
12
+ */__pycache__git
13
+ .pyc
14
+ .iml
15
+ __pycache__/
16
+ */__pycache__/
17
+ */*/__pycache__/
18
+ */*/*/__pycache__/
19
+ */*/*/*/__pycache__/
20
+ */*/*/*/*/__pycache__/
21
+ */*/*/*/*/*/__pycache__/
2_gpu.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "compute_environment": "LOCAL_MACHINE",
3
+ "distributed_type": "MULTI_GPU",
4
+ "fp16": false,
5
+ "machine_rank": 0,
6
+ "main_process_ip": null,
7
+ "main_process_port": null,
8
+ "main_training_function": "main",
9
+ "num_machines": 1,
10
+ "num_processes": 2
11
+ }
3_gpu.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "compute_environment": "LOCAL_MACHINE",
3
+ "distributed_type": "MULTI_GPU",
4
+ "fp16": false,
5
+ "machine_rank": 0,
6
+ "main_process_ip": null,
7
+ "main_process_port": null,
8
+ "main_training_function": "main",
9
+ "num_machines": 1,
10
+ "num_processes": 3
11
+ }
4_gpu.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "compute_environment": "LOCAL_MACHINE",
3
+ "distributed_type": "MULTI_GPU",
4
+ "fp16": false,
5
+ "machine_rank": 0,
6
+ "main_process_ip": null,
7
+ "main_process_port": null,
8
+ "main_training_function": "main",
9
+ "num_machines": 1,
10
+ "num_processes": 4
11
+ }
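Note: 2_gpu.json, 3_gpu.json, and 4_gpu.json look like Hugging Face accelerate launch configurations that differ only in num_processes. A minimal launch sketch under that assumption (the training script's own CLI flags are not shown in this commit, so none are filled in here):

# Minimal sketch (assumption): pick the config that matches the available GPUs.
# train_global.py is the training script added in this commit; append its real
# training arguments after the script name.
accelerate launch --config_file 2_gpu.json train_global.py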
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,3 @@
1
- ---
2
- license: apache-2.0
3
- ---
1
+ # ELITE
2
+
3
+ The detailed README is coming soon.
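Since the new README is only a stub, here is a minimal inference quick-start assembled from the other files in this commit (the commands mirror inference_global.sh and inference_local.sh; the checkpoint locations are the paths those scripts expect, not something this commit documents):

# Assumes the 'elite' conda environment from elite.yaml is active (see the note
# after that file below) and that the pretrained mappers have been placed at:
#   ./checkpoints/global_mapper.pt
#   ./checkpoints/local_mapper.pt

# Global-mapping inference on ./test_datasets (writes to ./outputs/global_mapping):
bash inference_global.sh

# Global + local mapping inference (writes to ./outputs/local_mapping):
bash inference_local.sh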
datasets.py ADDED
@@ -0,0 +1,541 @@
1
+ from packaging import version
2
+ from PIL import Image
3
+ from torchvision import transforms
4
+ import os
5
+ import PIL
6
+ from torch.utils.data import Dataset
7
+ import torchvision
8
+ import numpy as np
9
+ import torch
10
+ import random
11
+ import albumentations as A
12
+ import copy
13
+ import cv2
14
+ import pandas as pd
15
+
16
+
17
+ imagenet_templates_small = [
18
+ "a photo of a {}",
19
+ "a rendering of a {}",
20
+ "a cropped photo of the {}",
21
+ "the photo of a {}",
22
+ "a photo of a clean {}",
23
+ "a photo of a dirty {}",
24
+ "a dark photo of the {}",
25
+ "a photo of my {}",
26
+ "a photo of the cool {}",
27
+ "a close-up photo of a {}",
28
+ "a bright photo of the {}",
29
+ "a cropped photo of a {}",
30
+ "a photo of the {}",
31
+ "a good photo of the {}",
32
+ "a photo of one {}",
33
+ "a close-up photo of the {}",
34
+ "a rendition of the {}",
35
+ "a photo of the clean {}",
36
+ "a rendition of a {}",
37
+ "a photo of a nice {}",
38
+ "a good photo of a {}",
39
+ "a photo of the nice {}",
40
+ "a photo of the small {}",
41
+ "a photo of the weird {}",
42
+ "a photo of the large {}",
43
+ "a photo of a cool {}",
44
+ "a photo of a small {}",
45
+ ]
46
+
47
+
48
+ if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
49
+ PIL_INTERPOLATION = {
50
+ "linear": PIL.Image.Resampling.BILINEAR,
51
+ "bilinear": PIL.Image.Resampling.BILINEAR,
52
+ "bicubic": PIL.Image.Resampling.BICUBIC,
53
+ "lanczos": PIL.Image.Resampling.LANCZOS,
54
+ "nearest": PIL.Image.Resampling.NEAREST,
55
+ }
56
+ else:
57
+ PIL_INTERPOLATION = {
58
+ "linear": PIL.Image.LINEAR,
59
+ "bilinear": PIL.Image.BILINEAR,
60
+ "bicubic": PIL.Image.BICUBIC,
61
+ "lanczos": PIL.Image.LANCZOS,
62
+ "nearest": PIL.Image.NEAREST,
63
+ }
64
+
65
+ def is_image(file):
66
+ return 'jpg' in file.lower() or 'png' in file.lower() or 'jpeg' in file.lower()
67
+
68
+ class CustomDatasetWithBG(Dataset):
69
+ def __init__(
70
+ self,
71
+ data_root,
72
+ tokenizer,
73
+ size=512,
74
+ interpolation="bicubic",
75
+ placeholder_token="*",
76
+ template="a photo of a {}",
77
+ ):
78
+ self.data_root = data_root
79
+ self.tokenizer = tokenizer
80
+ self.size = size
81
+ self.placeholder_token = placeholder_token
82
+
83
+ self.image_paths = []
84
+ self.image_paths += [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root) if is_image(file_path) and not 'bg' in file_path]
85
+
86
+ self.image_paths = sorted(self.image_paths)
87
+
88
+ self.num_images = len(self.image_paths)
89
+ self._length = self.num_images
90
+
91
+ self.interpolation = {
92
+ "linear": PIL_INTERPOLATION["linear"],
93
+ "bilinear": PIL_INTERPOLATION["bilinear"],
94
+ "bicubic": PIL_INTERPOLATION["bicubic"],
95
+ "lanczos": PIL_INTERPOLATION["lanczos"],
96
+ }[interpolation]
97
+
98
+ self.template = template
99
+
100
+ def __len__(self):
101
+ return self._length
102
+
103
+ def get_tensor_clip(self, normalize=True, toTensor=True):
104
+ transform_list = []
105
+ if toTensor:
106
+ transform_list += [torchvision.transforms.ToTensor()]
107
+ if normalize:
108
+ transform_list += [torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
109
+ (0.26862954, 0.26130258, 0.27577711))]
110
+ return torchvision.transforms.Compose(transform_list)
111
+
112
+ def process(self, image):
113
+ img = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
114
+ img = np.array(img).astype(np.float32)
115
+ img = img / 127.5 - 1.0
116
+ return torch.from_numpy(img).permute(2, 0, 1)
117
+
118
+ def __getitem__(self, i):
119
+ example = {}
120
+
121
+ placeholder_string = self.placeholder_token
122
+ text = self.template.format(placeholder_string)
123
+ example["text"] = text
124
+
125
+ placeholder_index = 0
126
+ words = text.strip().split(' ')
127
+ for idx, word in enumerate(words):
128
+ if word == placeholder_string:
129
+ placeholder_index = idx + 1
130
+
131
+ example["index"] = torch.tensor(placeholder_index)
132
+
133
+ example["input_ids"] = self.tokenizer(
134
+ text,
135
+ padding="max_length",
136
+ truncation=True,
137
+ max_length=self.tokenizer.model_max_length,
138
+ return_tensors="pt",
139
+ ).input_ids[0]
140
+
141
+ image = Image.open(self.image_paths[i % self.num_images])
142
+
143
+ mask_path = self.image_paths[i % self.num_images].replace('.jpeg', '.png').replace('.jpg', '.png').replace('.JPEG', '.png')[:-4] + '_bg.png'
144
+ mask = np.array(Image.open(mask_path)) / 255.0
145
+
146
+ if not image.mode == "RGB":
147
+ image = image.convert("RGB")
148
+
149
+ image_np = np.array(image)
150
+ object_tensor = image_np * mask
151
+ example["pixel_values"] = self.process(image_np)
152
+
153
+
154
+ ref_object_tensor = Image.fromarray(object_tensor.astype('uint8')).resize((224, 224), resample=self.interpolation)
155
+ ref_image_tenser = Image.fromarray(image_np.astype('uint8')).resize((224, 224), resample=self.interpolation)
156
+ example["pixel_values_obj"] = self.get_tensor_clip()(ref_object_tensor)
157
+ example["pixel_values_clip"] = self.get_tensor_clip()(ref_image_tenser)
158
+
159
+ ref_seg_tensor = Image.fromarray(mask.astype('uint8') * 255)
160
+ ref_seg_tensor = self.get_tensor_clip(normalize=False)(ref_seg_tensor)
161
+ example["pixel_values_seg"] = torch.nn.functional.interpolate(ref_seg_tensor.unsqueeze(0), size=(128, 128), mode='nearest').squeeze(0)
162
+
163
+ return example
164
+
165
+
166
+ class OpenImagesDataset(Dataset):
167
+ def __init__(
168
+ self,
169
+ data_root,
170
+ tokenizer,
171
+ size=512,
172
+ interpolation="bicubic",
173
+ set="train",
174
+ placeholder_token="*",
175
+ ):
176
+ self.data_root = data_root
177
+ self.tokenizer = tokenizer
178
+ self.size = size
179
+ self.placeholder_token = placeholder_token
180
+ self.set_type = set
181
+
182
+ self.random_trans = A.Compose([
183
+ A.Resize(height=224, width=224),
184
+ A.HorizontalFlip(p=0.5),
185
+ A.Rotate(limit=20),
186
+ A.Blur(p=0.3),
187
+ A.ElasticTransform(p=0.3)
188
+ ])
189
+
190
+ self.bbox_path_list = []
191
+ if set == "train":
192
+ bboxs_path = os.path.join(data_root, 'annotations', f'oidv6-train-annotations-bbox.csv')
193
+ elif set == "validation":
194
+ bboxs_path = os.path.join(data_root, 'annotations', f'validation-annotations-bbox.csv')
195
+ else:
196
+ bboxs_path = os.path.join(data_root, 'annotations', f'test-annotations-bbox.csv')
197
+
198
+ df_val_bbox = pd.read_csv(bboxs_path)
199
+ bbox_groups = df_val_bbox.groupby(df_val_bbox.LabelName)
200
+
201
+ bbox_full = []
202
+ for label_name in df_val_bbox['LabelName'].unique():
203
+ bboxs = bbox_groups.get_group(label_name)[
204
+ ['XMin', 'XMax', 'YMin', 'YMax', 'LabelName', 'ImageID',
205
+ 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsInside']].values.tolist()
206
+ bboxs_new = []
207
+ for bbox in bboxs:
208
+ if not ((bbox[1] - bbox[0]) * (bbox[3] - bbox[2]) > 0.8 or (bbox[1] - bbox[0]) * (
209
+ bbox[3] - bbox[2]) < 0.02):
210
+ bboxs_new.append([bbox[0], bbox[1], bbox[2], bbox[3], bbox[4], bbox[5]])
211
+ bbox_full.extend(bboxs_new)
212
+
213
+ self.bboxs_full = bbox_full
214
+
215
+ self.num_images = len(bbox_full)
216
+
217
+ print('{}: total {} images ...'.format(set, self.num_images))
218
+
219
+ self._length = self.num_images
220
+
221
+ self.interpolation = {
222
+ "linear": PIL_INTERPOLATION["linear"],
223
+ "bilinear": PIL_INTERPOLATION["bilinear"],
224
+ "bicubic": PIL_INTERPOLATION["bicubic"],
225
+ "lanczos": PIL_INTERPOLATION["lanczos"],
226
+ }[interpolation]
227
+
228
+ self.templates = imagenet_templates_small
229
+
230
+
231
+ def __len__(self):
232
+ return self._length
233
+
234
+ def get_tensor_clip(self, normalize=True, toTensor=True):
235
+ transform_list = []
236
+ if toTensor:
237
+ transform_list += [torchvision.transforms.ToTensor()]
238
+ if normalize:
239
+ transform_list += [torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
240
+ (0.26862954, 0.26130258, 0.27577711))]
241
+ return torchvision.transforms.Compose(transform_list)
242
+
243
+ def process(self, image):
244
+ img = np.array(image)
245
+ img = cv2.resize(img, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
246
+ img = np.array(img).astype(np.float32)
247
+ img = img / 127.5 - 1.0
248
+ return torch.from_numpy(img).permute(2, 0, 1)
249
+
250
+ def obtain_text(self, add_caption, object_category=None):
251
+
252
+ if object_category is None:
253
+ placeholder_string = self.placeholder_token
254
+ else:
255
+ placeholder_string = object_category
256
+
257
+ text = random.choice(self.templates).format(placeholder_string)
258
+ text = add_caption + text[1:]
259
+
260
+ placeholder_index = 0
261
+ words = text.strip().split(' ')
262
+ for idx, word in enumerate(words):
263
+ if word == placeholder_string:
264
+ placeholder_index = idx + 1
265
+
266
+ index = torch.tensor(placeholder_index)
267
+
268
+ input_ids = self.tokenizer(
269
+ text,
270
+ padding="max_length",
271
+ truncation=True,
272
+ max_length=self.tokenizer.model_max_length,
273
+ return_tensors="pt",
274
+ ).input_ids[0]
275
+ return input_ids, index, text
276
+
277
+ def __getitem__(self, i):
278
+ example = {}
279
+
280
+ input_ids, index, text = self.obtain_text('a')
281
+ example["input_ids"] = input_ids
282
+ example["index"] = index
283
+ example["text"] = text
284
+
285
+ bbox_sample = self.bboxs_full[i % self.num_images]
286
+ bbox_sample = copy.copy(bbox_sample)
287
+
288
+ file_name = bbox_sample[-1] + '.jpg'
289
+ img_path = os.path.join(self.data_root, 'images', self.set_type, file_name)
290
+
291
+ try:
292
+ img_p = Image.open(img_path).convert("RGB")
293
+ img_p_np = np.array(img_p)
294
+ bbox_sample[0] *= int(img_p_np.shape[1])
295
+ bbox_sample[1] *= int(img_p_np.shape[1])
296
+ bbox_sample[2] *= int(img_p_np.shape[0])
297
+ bbox_sample[3] *= int(img_p_np.shape[0])
298
+
299
+ bbox_pad = copy.copy(bbox_sample)
300
+ bbox_pad[0] = int(bbox_sample[0] - min(10, bbox_sample[0] - 0))
301
+ bbox_pad[1] = int(bbox_sample[1] + min(10, img_p.size[0] - bbox_sample[1]))
302
+ bbox_pad[2] = int(bbox_sample[2] - min(10, bbox_sample[2] - 0))
303
+ bbox_pad[3] = int(bbox_sample[3] + min(10, img_p.size[1] - bbox_sample[3]))
304
+
305
+ image_tensor = img_p_np[bbox_pad[2]:bbox_pad[3], bbox_pad[0]:bbox_pad[1], :]
306
+ example["pixel_values"] = self.process(image_tensor)
307
+
308
+ ref_image_tensor = self.random_trans(image=image_tensor)
309
+ ref_image_tensor = Image.fromarray(ref_image_tensor["image"])
310
+ example["pixel_values_clip"] = self.get_tensor_clip()(ref_image_tensor)
311
+
312
+ except Exception as e:
313
+ example["pixel_values"] = torch.zeros((3, 512, 512))
314
+ example["pixel_values_clip"] = torch.zeros((3, 224, 224))
315
+ with open('error.txt', 'a+') as f:
316
+ f.write(str(e) + '\n')
317
+
318
+ return example
319
+
320
+
321
+ class OpenImagesDatasetWithMask(OpenImagesDataset):
322
+ def __init__(self,
323
+ data_root,
324
+ tokenizer,
325
+ size=512,
326
+ interpolation="bicubic",
327
+ set="train",
328
+ placeholder_token="*"):
329
+
330
+ # super().__init__(data_root, tokenizer, size, interpolation, set, placeholder_token)
331
+ self.data_root = data_root
332
+ self.tokenizer = tokenizer
333
+ self.size = size
334
+ self.placeholder_token = placeholder_token
335
+ self.set = set
336
+
337
+ class_anno_path = os.path.join(data_root, 'annotations', f'oidv6-class-descriptions.csv')
338
+ anno_files = pd.read_csv(class_anno_path)
339
+ class_groups = anno_files.groupby(anno_files.LabelName)
340
+
341
+ if set == "train":
342
+ bboxs_path = os.path.join(data_root, 'annotations', f'train-annotations-object-segmentation.csv')
343
+ dict_path = os.path.join(data_root, 'segs', f'train_bbox_dict.npy')
344
+ elif set == "validation":
345
+ bboxs_path = os.path.join(data_root, 'annotations', f'validation-annotations-object-segmentation.csv')
346
+ dict_path = os.path.join(data_root, 'segs', f'validation_bbox_dict.npy')
347
+ else:
348
+ bboxs_path = os.path.join(data_root, 'annotations', f'test-annotations-object-segmentation.csv')
349
+ dict_path = os.path.join(data_root, 'segs', f'test_bbox_dict.npy')
350
+
351
+ bbox_dict = np.load(dict_path, allow_pickle=True).item()
352
+
353
+ df_val_bbox = pd.read_csv(bboxs_path)
354
+ bbox_groups = df_val_bbox.groupby(df_val_bbox.LabelName)
355
+ bboxes_full = []
356
+ for label_name in df_val_bbox['LabelName'].unique():
357
+ bboxs = bbox_groups.get_group(label_name)[
358
+ ['BoxXMin', 'BoxXMax', 'BoxYMin', 'BoxYMax', 'LabelName', 'MaskPath']].values.tolist()
359
+ bboxes_new = []
360
+ for box in bboxs:
361
+ if not box[-1] in bbox_dict:
362
+ continue
363
+ bbox_data = bbox_dict[box[-1]]
364
+
365
+ if (bbox_data[2] - bbox_data[1]) < 100 or (bbox_data[4] - bbox_data[3]) < 100:
366
+ continue
367
+ if not ((bbox_data[2] - bbox_data[1]) / (bbox_data[4] - bbox_data[3]) < 0.5 or (
368
+ bbox_data[4] - bbox_data[3]) / ( bbox_data[2] - bbox_data[1]) < 0.5):
369
+ class_name = class_groups.get_group(box[4])[['DisplayName']].values.tolist()[0][0]
370
+ bboxes_new.append([box[-1], bbox_data[1], bbox_data[2], bbox_data[3], bbox_data[4], class_name])
371
+
372
+ bboxes_full.extend(bboxes_new)
373
+
374
+ self.bboxes_full = bboxes_full
375
+ self.num_images = len(bboxes_full)
376
+
377
+ print('{}: total {} images ...'.format(set, self.num_images))
378
+
379
+ self._length = self.num_images
380
+ self.interpolation = {
381
+ "linear": PIL_INTERPOLATION["linear"],
382
+ "bilinear": PIL_INTERPOLATION["bilinear"],
383
+ "bicubic": PIL_INTERPOLATION["bicubic"],
384
+ "lanczos": PIL_INTERPOLATION["lanczos"],
385
+ }[interpolation]
386
+
387
+ self.templates = imagenet_templates_small
388
+
389
+
390
+ def __len__(self):
391
+ return self._length
392
+
393
+ ## borrowed from custom diffusion
394
+ def custom_aug(self, instance_image):
395
+ instance_image = Image.fromarray(instance_image)
396
+ #### apply augmentation and create a valid image regions mask ####
397
+ if np.random.randint(0, 3) < 2:
398
+ random_scale = np.random.randint(self.size // 3, self.size + 1)
399
+ else:
400
+ random_scale = np.random.randint(int(1.2 * self.size), int(1.4 * self.size))
401
+
402
+ if random_scale % 2 == 1:
403
+ random_scale += 1
404
+
405
+ if random_scale < 0.6 * self.size:
406
+ add_to_caption = np.random.choice(["a far away", "very small"])
407
+ cx = np.random.randint(random_scale // 2, self.size - random_scale // 2 + 1)
408
+ cy = np.random.randint(random_scale // 2, self.size - random_scale // 2 + 1)
409
+
410
+ instance_image1 = instance_image.resize((random_scale, random_scale), resample=self.interpolation)
411
+ instance_image1 = np.array(instance_image1).astype(np.uint8)
412
+ instance_image1 = (instance_image1 / 127.5 - 1.0).astype(np.float32)
413
+
414
+ instance_image = np.zeros((self.size, self.size, 3), dtype=np.float32)
415
+ instance_image[cx - random_scale // 2: cx + random_scale // 2,
416
+ cy - random_scale // 2: cy + random_scale // 2, :] = instance_image1
417
+
418
+ mask = np.zeros((self.size // 8, self.size // 8))
419
+ mask[(cx - random_scale // 2) // 8 + 1: (cx + random_scale // 2) // 8 - 1,
420
+ (cy - random_scale // 2) // 8 + 1: (cy + random_scale // 2) // 8 - 1] = 1.
421
+
422
+ elif random_scale > self.size:
423
+ add_to_caption = np.random.choice(["zoomed in", "close up"])
424
+ cx = np.random.randint(self.size // 2, random_scale - self.size // 2 + 1)
425
+ cy = np.random.randint(self.size // 2, random_scale - self.size // 2 + 1)
426
+
427
+ instance_image = instance_image.resize((random_scale, random_scale), resample=self.interpolation)
428
+ instance_image = np.array(instance_image).astype(np.uint8)
429
+ instance_image = (instance_image / 127.5 - 1.0).astype(np.float32)
430
+ instance_image = instance_image[cx - self.size // 2: cx + self.size // 2,
431
+ cy - self.size // 2: cy + self.size // 2, :]
432
+ mask = np.ones((self.size // 8, self.size // 8))
433
+ else:
434
+ add_to_caption = "a"
435
+ if self.size is not None:
436
+ instance_image = instance_image.resize((self.size, self.size), resample=self.interpolation)
437
+ instance_image = np.array(instance_image).astype(np.uint8)
438
+ instance_image = (instance_image / 127.5 - 1.0).astype(np.float32)
439
+ mask = np.ones((self.size // 8, self.size // 8))
440
+
441
+ return torch.from_numpy(instance_image).permute(2, 0, 1), torch.from_numpy(mask[:, :, None]).permute(2, 0, 1), add_to_caption
442
+
443
+ def aug_cv2(self, img, seg):
444
+
445
+ img_auged = np.array(img).copy()
446
+ seg_auged = np.array(seg).copy()
447
+ # resize and crop
448
+ if random.choice([0, 1]) == 0:
449
+ new_size = random.randint(224, 256)
450
+ img_auged = cv2.resize(img_auged, (new_size, new_size), interpolation=cv2.INTER_CUBIC)
451
+ seg_auged = cv2.resize(seg_auged, (new_size, new_size), interpolation=cv2.INTER_NEAREST)
452
+
453
+ start_x, start_y = random.randint(0, new_size - 224), random.randint(0, new_size - 224)
454
+ img_auged = img_auged[start_x:start_x + 224, start_y:start_y + 224, :]
455
+ seg_auged = seg_auged[start_x:start_x + 224, start_y:start_y + 224, :]
456
+
457
+ h, w = img_auged.shape[:2]
458
+ # rotate
459
+ if random.choice([0, 1]) == 0:
460
+ # print('rotate')
461
+ angle = random.randint(-30, 30)
462
+ M = cv2.getRotationMatrix2D((112, 112), angle, 1)
463
+ img_auged = cv2.warpAffine(img_auged, M, (w, h), flags=cv2.INTER_CUBIC)
464
+ seg_auged = cv2.warpAffine(seg_auged, M, (w, h), flags=cv2.INTER_NEAREST)
465
+
466
+ # translation
467
+ if random.choice([0, 1]) == 0:
468
+ trans_x = random.randint(-60, 60)
469
+ trans_y = random.randint(-60, 60)
470
+ H = np.float32([[1, 0, trans_x],
471
+ [0, 1, trans_y]])
472
+ img_auged = cv2.warpAffine(img_auged, H, (w, h), flags=cv2.INTER_CUBIC)
473
+ seg_auged = cv2.warpAffine(seg_auged, H, (w, h), flags=cv2.INTER_NEAREST)
474
+
475
+ img_auged = Image.fromarray(img_auged)
476
+ seg_auged = Image.fromarray(seg_auged)
477
+
478
+ return img_auged, seg_auged
479
+
480
+
481
+ def __getitem__(self, i):
482
+ example = {}
483
+
484
+ seg_name = self.bboxes_full[i % self.num_images][0]
485
+ file_name = seg_name.split('_')[0] + '.jpg'
486
+ img_path = os.path.join(self.data_root, 'images', self.set, file_name)
487
+ seg_path = os.path.join(self.data_root, 'segs', self.set, seg_name)
488
+
489
+ try:
490
+ # crop image and mask
491
+ bbox_sample = self.bboxes_full[i % self.num_images][1:]
492
+ img_p_np = cv2.imread(img_path)
493
+ img_p_np = cv2.cvtColor(img_p_np, cv2.COLOR_BGR2RGB)
494
+ seg_p_np = cv2.imread(seg_path).astype('float')
495
+ seg_p_np = cv2.resize(seg_p_np, img_p_np.shape[:2][::-1], interpolation=cv2.INTER_NEAREST)
496
+
497
+ bbox_pad = copy.copy(bbox_sample)
498
+ pad_size = random.choice(list(range(10, 20)))
499
+ bbox_pad[0] = int(bbox_pad[0] - min(pad_size, bbox_pad[0] - 0))
500
+ bbox_pad[1] = int(bbox_pad[1] + pad_size)
501
+ bbox_pad[2] = int(bbox_pad[2] - min(pad_size, bbox_pad[2] - 0))
502
+ bbox_pad[3] = int(bbox_pad[3] + pad_size)
503
+
504
+ image_tensor = img_p_np[bbox_pad[0]:bbox_pad[1], bbox_pad[2]:bbox_pad[3], :]
505
+ seg_tensor = seg_p_np[bbox_pad[0]:bbox_pad[1], bbox_pad[2]:bbox_pad[3], :]
506
+
507
+ # augmentation for input image
508
+ augged_image, augged_mask, add_caption = self.custom_aug(image_tensor)
509
+ input_ids, index, text = self.obtain_text(add_caption)
510
+
511
+ example["pixel_values"] = augged_image
512
+ example["mask_values"] = augged_mask
513
+ example["input_ids"] = input_ids
514
+ example["index"] = index
515
+ example["text"] = text
516
+
517
+ object_tensor = image_tensor * (seg_tensor / 255)
518
+ ref_object_tensor = cv2.resize(object_tensor, (224, 224), interpolation=cv2.INTER_CUBIC)
519
+ ref_image_tenser = cv2.resize(image_tensor, (224, 224), interpolation=cv2.INTER_CUBIC)
520
+ ref_seg_tensor = cv2.resize(seg_tensor, (224, 224), interpolation=cv2.INTER_NEAREST)
521
+
522
+ ref_object_tensor, ref_seg_tensor = self.aug_cv2(ref_object_tensor.astype('uint8'), ref_seg_tensor.astype('uint8'))
523
+ example["pixel_values_clip"] = self.get_tensor_clip()(Image.fromarray(ref_image_tenser))
524
+ example["pixel_values_obj"] = self.get_tensor_clip()(ref_object_tensor)
525
+ example["pixel_values_seg"] = self.get_tensor_clip(normalize=False)(ref_seg_tensor)
526
+
527
+ except Exception as e:
528
+ example["pixel_values"] = torch.zeros((3, 512, 512))
529
+ example["pixel_values_obj"] = torch.zeros((3, 224, 224))
530
+ example["pixel_values_clip"] = torch.zeros((3, 224, 224))
531
+ example["pixel_values_seg"] = torch.zeros((3, 224, 224))
532
+
533
+ input_ids, index, text = self.obtain_text("a")
534
+ example["input_ids"] = input_ids
535
+ example["index"] = index
536
+ example["text"] = text
537
+
538
+ with open('error.txt', 'a+') as f:
539
+ f.write(str(e) + '\n')
540
+
541
+ return example
elite.yaml ADDED
@@ -0,0 +1,147 @@
1
+ name: elite
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - ca-certificates=2022.10.11=h06a4308_0
7
+ - certifi=2022.9.24=py39h06a4308_0
8
+ - ld_impl_linux-64=2.38=h1181459_1
9
+ - libffi=3.3=he6710b0_2
10
+ - libgcc-ng=9.1.0=hdf63c60_0
11
+ - libstdcxx-ng=9.1.0=hdf63c60_0
12
+ - ncurses=6.3=h7f8727e_2
13
+ - openssl=1.1.1s=h7f8727e_0
14
+ - pip=22.2.2=py39h06a4308_0
15
+ - python=3.9.12=h12debd9_1
16
+ - readline=8.1.2=h7f8727e_1
17
+ - sqlite=3.38.5=hc218d9a_0
18
+ - tk=8.6.12=h1ccaba5_0
19
+ - wheel=0.37.1=pyhd3eb1b0_0
20
+ - xz=5.2.5=h7f8727e_1
21
+ - zlib=1.2.12=h7f8727e_2
22
+ - pip:
23
+ - absl-py==1.3.0
24
+ - accelerate==0.15.0
25
+ - aiohttp==3.8.3
26
+ - aiosignal==1.3.1
27
+ - albumentations==1.1.0
28
+ - altair==4.2.0
29
+ - antlr4-python3-runtime==4.8
30
+ - async-timeout==4.0.2
31
+ - attrs==22.1.0
32
+ - blinker==1.5
33
+ - cachetools==5.2.0
34
+ - charset-normalizer==2.1.1
35
+ - click==8.1.3
36
+ - commonmark==0.9.1
37
+ - contourpy==1.0.6
38
+ - cycler==0.11.0
39
+ - cython==0.29.33
40
+ - decorator==5.1.1
41
+ - diffusers==0.11.1
42
+ - einops==0.4.1
43
+ - emoji==2.2.0
44
+ - entrypoints==0.4
45
+ - faiss-gpu==1.7.2
46
+ - filelock==3.8.0
47
+ - fonttools==4.38.0
48
+ - frozenlist==1.3.3
49
+ - fsspec==2022.11.0
50
+ - ftfy==6.1.1
51
+ - future==0.18.2
52
+ - gitdb==4.0.9
53
+ - gitpython==3.1.29
54
+ - google-auth==2.14.1
55
+ - google-auth-oauthlib==0.4.6
56
+ - grpcio==1.50.0
57
+ - huggingface-hub==0.11.0
58
+ - idna==3.4
59
+ - imageio==2.14.1
60
+ - imageio-ffmpeg==0.4.7
61
+ - importlib-metadata==5.0.0
62
+ - jinja2==3.1.2
63
+ - joblib==1.2.0
64
+ - jsonschema==4.17.0
65
+ - kiwisolver==1.4.4
66
+ - kornia==0.6.0
67
+ - markdown==3.4.1
68
+ - markupsafe==2.1.1
69
+ - matplotlib==3.6.2
70
+ - multidict==6.0.2
71
+ - networkx==2.8.8
72
+ - nltk==3.7
73
+ - numpy==1.23.4
74
+ - oauthlib==3.2.2
75
+ - omegaconf==2.1.1
76
+ - opencv-python==4.6.0.66
77
+ - opencv-python-headless==4.6.0.66
78
+ - packaging==21.3
79
+ - pandas==1.5.1
80
+ - pillow==9.0.1
81
+ - protobuf==3.20.1
82
+ - psutil==5.9.4
83
+ - pudb==2019.2
84
+ - pyarrow==10.0.0
85
+ - pyasn1==0.4.8
86
+ - pyasn1-modules==0.2.8
87
+ - pycocotools==2.0.6
88
+ - pydeck==0.8.0
89
+ - pydensecrf==1.0rc2
90
+ - pydeprecate==0.3.2
91
+ - pygments==2.13.0
92
+ - pympler==1.0.1
93
+ - pyparsing==3.0.9
94
+ - pyrsistent==0.19.2
95
+ - python-dateutil==2.8.2
96
+ - python-dotenv==0.21.0
97
+ - pytorch-lightning==1.6.5
98
+ - pytz==2022.6
99
+ - pytz-deprecation-shim==0.1.0.post0
100
+ - pywavelets==1.4.1
101
+ - pyyaml==6.0
102
+ - qudida==0.0.4
103
+ - regex==2022.10.31
104
+ - requests==2.28.1
105
+ - requests-oauthlib==1.3.1
106
+ - rich==12.6.0
107
+ - rsa==4.9
108
+ - sacremoses==0.0.53
109
+ - scikit-image==0.19.3
110
+ - scikit-learn==1.1.3
111
+ - scipy==1.9.3
112
+ - semver==2.13.0
113
+ - setuptools==59.5.0
114
+ - six==1.16.0
115
+ - smmap==5.0.0
116
+ - stanza==1.4.2
117
+ - streamlit==1.15.0
118
+ - tensorboard==2.11.0
119
+ - tensorboard-data-server==0.6.1
120
+ - tensorboard-plugin-wit==1.8.1
121
+ - test-tube==0.7.5
122
+ - threadpoolctl==3.1.0
123
+ - tifffile==2022.10.10
124
+ - timm==0.6.12
125
+ - tokenizers==0.12.1
126
+ - toml==0.10.2
127
+ - toolz==0.12.0
128
+ - torch==1.12.1+cu116
129
+ - torch-fidelity==0.3.0
130
+ - torchaudio==0.12.1+cu116
131
+ - torchmetrics==0.6.0
132
+ - torchvision==0.13.1+cu116
133
+ - tornado==6.2
134
+ - tqdm==4.64.1
135
+ - transformers==4.25.1
136
+ - typing-extensions==4.4.0
137
+ - tzdata==2022.6
138
+ - tzlocal==4.2
139
+ - urllib3==1.26.12
140
+ - urwid==2.1.2
141
+ - validators==0.20.0
142
+ - watchdog==2.1.9
143
+ - wcwidth==0.2.5
144
+ - werkzeug==2.2.2
145
+ - yarl==1.8.1
146
+ - zipp==3.10.0
147
+
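A sketch of materializing this environment with conda; note that the pip section pins CUDA 11.6 wheels (torch==1.12.1+cu116 and friends), which pip resolves only from the PyTorch wheel index, so the fallback below is an assumption rather than part of this commit:

# Create and activate the environment defined in elite.yaml.
conda env create -f elite.yaml
conda activate elite

# If the create step fails to find the +cu116 wheels, installing them from the
# PyTorch wheel index afterwards is a common workaround (index URL assumed):
pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 torchaudio==0.12.1+cu116 \
    --extra-index-url https://download.pytorch.org/whl/cu116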
inference_global.py ADDED
@@ -0,0 +1,214 @@
1
+ import os
2
+ from typing import Optional, Tuple
3
+ import numpy as np
4
+ import torch
5
+ from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
6
+ from PIL import Image
7
+ from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModel
8
+ from train_global import Mapper, th2image
9
+ from train_global import inj_forward_text, inj_forward_crossattention, validation
10
+ import torch.nn as nn
11
+ from datasets import CustomDatasetWithBG
12
+
13
+ def _pil_from_latents(vae, latents):
14
+ _latents = 1 / 0.18215 * latents.clone()
15
+ image = vae.decode(_latents).sample
16
+
17
+ image = (image / 2 + 0.5).clamp(0, 1)
18
+ image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
19
+ images = (image * 255).round().astype("uint8")
20
+ ret_pil_images = [Image.fromarray(image) for image in images]
21
+
22
+ return ret_pil_images
23
+
24
+
25
+ def pww_load_tools(
26
+ device: str = "cuda:0",
27
+ scheduler_type=LMSDiscreteScheduler,
28
+ mapper_model_path: Optional[str] = None,
29
+ diffusion_model_path: Optional[str] = None,
30
+ model_token: Optional[str] = None,
31
+ ) -> Tuple[
32
+ UNet2DConditionModel,
33
+ CLIPTextModel,
34
+ CLIPTokenizer,
35
+ AutoencoderKL,
36
+ CLIPVisionModel,
37
+ Mapper,
38
+ LMSDiscreteScheduler,
39
+ ]:
40
+
41
+ # 'CompVis/stable-diffusion-v1-4'
42
+ local_path_only = diffusion_model_path is not None
43
+ vae = AutoencoderKL.from_pretrained(
44
+ diffusion_model_path,
45
+ subfolder="vae",
46
+ use_auth_token=model_token,
47
+ torch_dtype=torch.float16,
48
+ local_files_only=local_path_only,
49
+ )
50
+
51
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16,)
52
+ text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16,)
53
+ image_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16,)
54
+
55
+
56
+ # Load models and create wrapper for stable diffusion
57
+ for _module in text_encoder.modules():
58
+ if _module.__class__.__name__ == "CLIPTextTransformer":
59
+ _module.__class__.__call__ = inj_forward_text
60
+
61
+ unet = UNet2DConditionModel.from_pretrained(
62
+ diffusion_model_path,
63
+ subfolder="unet",
64
+ use_auth_token=model_token,
65
+ torch_dtype=torch.float16,
66
+ local_files_only=local_path_only,
67
+ )
68
+
69
+ mapper = Mapper(input_dim=1024, output_dim=768)
70
+
71
+ for _name, _module in unet.named_modules():
72
+ if _module.__class__.__name__ == "CrossAttention":
73
+ if 'attn1' in _name: continue
74
+ _module.__class__.__call__ = inj_forward_crossattention
75
+
76
+ shape = _module.to_k.weight.shape
77
+ to_k_global = nn.Linear(shape[1], shape[0], bias=False)
78
+ mapper.add_module(f'{_name.replace(".", "_")}_to_k', to_k_global)
79
+
80
+ shape = _module.to_v.weight.shape
81
+ to_v_global = nn.Linear(shape[1], shape[0], bias=False)
82
+ mapper.add_module(f'{_name.replace(".", "_")}_to_v', to_v_global)
83
+
84
+ mapper.load_state_dict(torch.load(mapper_model_path, map_location='cpu'))
85
+ mapper.half()
86
+
87
+ for _name, _module in unet.named_modules():
88
+ if 'attn1' in _name: continue
89
+ if _module.__class__.__name__ == "CrossAttention":
90
+ _module.add_module('to_k_global', mapper.__getattr__(f'{_name.replace(".", "_")}_to_k'))
91
+ _module.add_module('to_v_global', mapper.__getattr__(f'{_name.replace(".", "_")}_to_v'))
92
+
93
+ vae.to(device), unet.to(device), text_encoder.to(device), image_encoder.to(device), mapper.to(device)
94
+
95
+ scheduler = scheduler_type(
96
+ beta_start=0.00085,
97
+ beta_end=0.012,
98
+ beta_schedule="scaled_linear",
99
+ num_train_timesteps=1000,
100
+ )
101
+ vae.eval()
102
+ unet.eval()
103
+ image_encoder.eval()
104
+ text_encoder.eval()
105
+ mapper.eval()
106
+ return vae, unet, text_encoder, tokenizer, image_encoder, mapper, scheduler
107
+
108
+
109
+ def parse_args():
110
+ import argparse
111
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
112
+ parser.add_argument(
113
+ "--token_index",
114
+ type=str,
115
+ default="full",
116
+ help="Selected index for word embedding.",
117
+ )
118
+
119
+ parser.add_argument(
120
+ "--global_mapper_path",
121
+ type=str,
122
+ required=True,
123
+ help="Path to pretrained global mapping network.",
124
+ )
125
+
126
+ parser.add_argument(
127
+ "--output_dir",
128
+ type=str,
129
+ default='outputs',
130
+ help="The output directory where the model predictions will be written.",
131
+ )
132
+
133
+ parser.add_argument(
134
+ "--placeholder_token",
135
+ type=str,
136
+ default="S",
137
+ help="A token to use as a placeholder for the concept.",
138
+ )
139
+
140
+ parser.add_argument(
141
+ "--template",
142
+ type=str,
143
+ default="a photo of a {}",
144
+ help="Text template for customized genetation.",
145
+ )
146
+
147
+ parser.add_argument(
148
+ "--test_data_dir", type=str, default=None, required=True, help="A folder containing the testing data."
149
+ )
150
+
151
+ parser.add_argument(
152
+ "--pretrained_model_name_or_path",
153
+ type=str,
154
+ default=None,
155
+ required=True,
156
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
157
+ )
158
+
159
+ parser.add_argument(
160
+ "--suffix",
161
+ type=str,
162
+ default="object",
163
+ help="Suffix of save directory.",
164
+ )
165
+
166
+ parser.add_argument(
167
+ "--selected_data",
168
+ type=int,
169
+ default=-1,
170
+ help="Data index. -1 for all.",
171
+ )
172
+
173
+ args = parser.parse_args()
174
+ return args
175
+
176
+
177
+ if __name__ == "__main__":
178
+ args = parse_args()
179
+
180
+ save_dir = os.path.join(args.output_dir, f'{args.suffix}_token{args.token_index}')
181
+ os.makedirs(save_dir, exist_ok=True)
182
+
183
+ vae, unet, text_encoder, tokenizer, image_encoder, mapper, scheduler = pww_load_tools(
184
+ "cuda:0",
185
+ LMSDiscreteScheduler,
186
+ diffusion_model_path=args.pretrained_model_name_or_path,
187
+ mapper_model_path=args.global_mapper_path,
188
+ )
189
+
190
+ train_dataset = CustomDatasetWithBG(
191
+ data_root=args.test_data_dir,
192
+ tokenizer=tokenizer,
193
+ size=512,
194
+ placeholder_token=args.placeholder_token,
195
+ template=args.template,
196
+ )
197
+
198
+ train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False)
199
+ for step, batch in enumerate(train_dataloader):
200
+ if args.selected_data > -1 and step != args.selected_data:
201
+ continue
202
+ batch["pixel_values"] = batch["pixel_values"].to("cuda:0")
203
+ batch["pixel_values_clip"] = batch["pixel_values_clip"].to("cuda:0").half()
204
+ batch["input_ids"] = batch["input_ids"].to("cuda:0")
205
+ batch["index"] = batch["index"].to("cuda:0").long()
206
+ print(step, batch['text'])
207
+ seeds = [0, 42, 10086, 777, 555, 222, 111, 999, 327, 283, 190, 218, 2371, 9329, 2938, 2073, 27367, 293,
208
+ 8269, 87367, 29379, 4658, 39, 598]
209
+ seeds = sorted(seeds)
210
+ for seed in seeds:
211
+ syn_images = validation(batch, tokenizer, image_encoder, text_encoder, unet, mapper, vae, batch["pixel_values_clip"].device, 5,
212
+ token_index=args.token_index, seed=seed)
213
+ concat = np.concatenate((np.array(syn_images[0]), th2image(batch["pixel_values"][0])), axis=1)
214
+ Image.fromarray(concat).save(os.path.join(save_dir, f'{str(step).zfill(5)}_{str(seed).zfill(5)}.jpg'))
inference_global.sh ADDED
@@ -0,0 +1,12 @@
1
+ export MODEL_NAME="CompVis/stable-diffusion-v1-4"
2
+ export DATA_DIR='./test_datasets/'
3
+
4
+ CUDA_VISIBLE_DEVICES=6 python inference_global.py \
5
+ --pretrained_model_name_or_path=$MODEL_NAME \
6
+ --test_data_dir=$DATA_DIR \
7
+ --output_dir="./outputs/global_mapping" \
8
+ --suffix="object" \
9
+ --token_index="0" \
10
+ --template="a photo of a {}" \
11
+ --global_mapper_path="./checkpoints/global_mapper.pt"
12
+
inference_local.py ADDED
@@ -0,0 +1,247 @@
1
+ import os
2
+ from typing import Optional, Tuple
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
8
+ from PIL import Image
9
+ from tqdm.auto import tqdm
10
+ from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModel
11
+ from train_local import Mapper, th2image, MapperLocal
12
+ from train_local import inj_forward_text, inj_forward_crossattention, validation
13
+ import torch.nn as nn
14
+ from datasets import CustomDatasetWithBG
15
+
16
+ def _pil_from_latents(vae, latents):
17
+ _latents = 1 / 0.18215 * latents.clone()
18
+ image = vae.decode(_latents).sample
19
+
20
+ image = (image / 2 + 0.5).clamp(0, 1)
21
+ image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
22
+ images = (image * 255).round().astype("uint8")
23
+ ret_pil_images = [Image.fromarray(image) for image in images]
24
+
25
+ return ret_pil_images
26
+
27
+
28
+ def pww_load_tools(
29
+ device: str = "cuda:0",
30
+ scheduler_type=LMSDiscreteScheduler,
31
+ mapper_model_path: Optional[str] = None,
32
+ mapper_local_model_path: Optional[str] = None,
33
+ diffusion_model_path: Optional[str] = None,
34
+ model_token: Optional[str] = None,
35
+ ) -> Tuple[
36
+ UNet2DConditionModel,
37
+ CLIPTextModel,
38
+ CLIPTokenizer,
39
+ AutoencoderKL,
40
+ CLIPVisionModel,
41
+ Mapper,
42
+ MapperLocal,
43
+ LMSDiscreteScheduler,
44
+ ]:
45
+
46
+ # 'CompVis/stable-diffusion-v1-4'
47
+ local_path_only = diffusion_model_path is not None
48
+ vae = AutoencoderKL.from_pretrained(
49
+ diffusion_model_path,
50
+ subfolder="vae",
51
+ use_auth_token=model_token,
52
+ torch_dtype=torch.float16,
53
+ local_files_only=local_path_only,
54
+ )
55
+
56
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16,)
57
+ text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16,)
58
+ image_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16,)
59
+
60
+
61
+ # Load models and create wrapper for stable diffusion
62
+ for _module in text_encoder.modules():
63
+ if _module.__class__.__name__ == "CLIPTextTransformer":
64
+ _module.__class__.__call__ = inj_forward_text
65
+
66
+ unet = UNet2DConditionModel.from_pretrained(
67
+ diffusion_model_path,
68
+ subfolder="unet",
69
+ use_auth_token=model_token,
70
+ torch_dtype=torch.float16,
71
+ local_files_only=local_path_only,
72
+ )
73
+ inj_forward_crossattention
74
+ mapper = Mapper(input_dim=1024, output_dim=768)
75
+
76
+ mapper_local = MapperLocal(input_dim=1024, output_dim=768)
77
+
78
+ for _name, _module in unet.named_modules():
79
+ if _module.__class__.__name__ == "CrossAttention":
80
+ if 'attn1' in _name: continue
81
+ _module.__class__.__call__ = inj_forward_crossattention
82
+
83
+ shape = _module.to_k.weight.shape
84
+ to_k_global = nn.Linear(shape[1], shape[0], bias=False)
85
+ mapper.add_module(f'{_name.replace(".", "_")}_to_k', to_k_global)
86
+
87
+ shape = _module.to_v.weight.shape
88
+ to_v_global = nn.Linear(shape[1], shape[0], bias=False)
89
+ mapper.add_module(f'{_name.replace(".", "_")}_to_v', to_v_global)
90
+
91
+ to_v_local = nn.Linear(shape[1], shape[0], bias=False)
92
+ mapper_local.add_module(f'{_name.replace(".", "_")}_to_v', to_v_local)
93
+
94
+ to_k_local = nn.Linear(shape[1], shape[0], bias=False)
95
+ mapper_local.add_module(f'{_name.replace(".", "_")}_to_k', to_k_local)
96
+
97
+ mapper.load_state_dict(torch.load(mapper_model_path, map_location='cpu'))
98
+ mapper.half()
99
+
100
+ mapper_local.load_state_dict(torch.load(mapper_local_model_path, map_location='cpu'))
101
+ mapper_local.half()
102
+
103
+ for _name, _module in unet.named_modules():
104
+ if 'attn1' in _name: continue
105
+ if _module.__class__.__name__ == "CrossAttention":
106
+ _module.add_module('to_k_global', mapper.__getattr__(f'{_name.replace(".", "_")}_to_k'))
107
+ _module.add_module('to_v_global', mapper.__getattr__(f'{_name.replace(".", "_")}_to_v'))
108
+ _module.add_module('to_v_local', getattr(mapper_local, f'{_name.replace(".", "_")}_to_v'))
109
+ _module.add_module('to_k_local', getattr(mapper_local, f'{_name.replace(".", "_")}_to_k'))
110
+
111
+ vae.to(device), unet.to(device), text_encoder.to(device), image_encoder.to(device), mapper.to(device), mapper_local.to(device)
112
+
113
+ scheduler = scheduler_type(
114
+ beta_start=0.00085,
115
+ beta_end=0.012,
116
+ beta_schedule="scaled_linear",
117
+ num_train_timesteps=1000,
118
+ )
119
+ vae.eval()
120
+ unet.eval()
121
+ image_encoder.eval()
122
+ text_encoder.eval()
123
+ mapper.eval()
124
+ mapper_local.eval()
125
+ return vae, unet, text_encoder, tokenizer, image_encoder, mapper, mapper_local, scheduler
126
+
127
+
128
+
129
+ def parse_args():
130
+
131
+ import argparse
132
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
133
+
134
+ parser.add_argument(
135
+ "--global_mapper_path",
136
+ type=str,
137
+ required=True,
138
+ help="Path to pretrained global mapping network.",
139
+ )
140
+
141
+ parser.add_argument(
142
+ "--local_mapper_path",
143
+ type=str,
144
+ required=True,
145
+ help="Path to pretrained local mapping network.",
146
+ )
147
+
148
+ parser.add_argument(
149
+ "--output_dir",
150
+ type=str,
151
+ default='outputs',
152
+ help="The output directory where the model predictions will be written.",
153
+ )
154
+
155
+ parser.add_argument(
156
+ "--placeholder_token",
157
+ type=str,
158
+ default="S",
159
+ help="A token to use as a placeholder for the concept.",
160
+ )
161
+
162
+ parser.add_argument(
163
+ "--template",
164
+ type=str,
165
+ default="a photo of a {}",
166
+ help="Text template for customized generation.",
167
+ )
168
+
169
+ parser.add_argument(
170
+ "--test_data_dir", type=str, default=None, required=True, help="A folder containing the testing data."
171
+ )
172
+
173
+ parser.add_argument(
174
+ "--pretrained_model_name_or_path",
175
+ type=str,
176
+ default=None,
177
+ required=True,
178
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
179
+ )
180
+
181
+ parser.add_argument(
182
+ "--suffix",
183
+ type=str,
184
+ default="object",
185
+ help="Suffix of save directory.",
186
+ )
187
+
188
+ parser.add_argument(
189
+ "--selected_data",
190
+ type=int,
191
+ default=-1,
192
+ help="Data index. -1 for all.",
193
+ )
194
+
195
+ parser.add_argument(
196
+ "--llambda",
197
+ type=str,
198
+ default="0.8",
199
+ help="Lambda for fusing the global and local features.",
200
+ )
201
+
202
+ args = parser.parse_args()
203
+ return args
204
+
205
+
206
+ if __name__ == "__main__":
207
+ args = parse_args()
208
+
209
+ save_dir = os.path.join(args.output_dir, f'{args.suffix}_l{args.llambda.replace(".", "p")}')
210
+ os.makedirs(save_dir, exist_ok=True)
211
+
212
+ vae, unet, text_encoder, tokenizer, image_encoder, mapper, mapper_local, scheduler = pww_load_tools(
213
+ "cuda:0",
214
+ LMSDiscreteScheduler,
215
+ diffusion_model_path=args.pretrained_model_name_or_path,
216
+ mapper_model_path=args.global_mapper_path,
217
+ mapper_local_model_path=args.local_mapper_path,
218
+ )
219
+
220
+ train_dataset = CustomDatasetWithBG(
221
+ data_root=args.test_data_dir,
222
+ tokenizer=tokenizer,
223
+ size=512,
224
+ placeholder_token=args.placeholder_token,
225
+ template=args.template,
226
+ )
227
+
228
+ train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False)
229
+ for step, batch in enumerate(train_dataloader):
230
+ if args.selected_data > -1 and step != args.selected_data:
231
+ continue
232
+ batch["pixel_values"] = batch["pixel_values"].to("cuda:0")
233
+ batch["pixel_values_clip"] = batch["pixel_values_clip"].to("cuda:0").half()
234
+ batch["pixel_values_obj"] = batch["pixel_values_obj"].to("cuda:0").half()
235
+ batch["pixel_values_seg"] = batch["pixel_values_seg"].to("cuda:0").half()
236
+ batch["input_ids"] = batch["input_ids"].to("cuda:0")
237
+ batch["index"] = batch["index"].to("cuda:0").long()
238
+ print(step, batch['text'])
239
+ seeds = [0, 42, 10086, 777, 555, 222, 111, 999, 327, 283, 190, 218, 2371, 9329, 2938, 2073, 27367, 293,
240
+ 8269, 87367, 29379, 4658, 39, 598]
241
+ seeds = sorted(seeds)
242
+ for seed in seeds:
243
+ syn_images = validation(batch, tokenizer, image_encoder, text_encoder, unet, mapper, mapper_local, vae,
244
+ batch["pixel_values_clip"].device, 5,
245
+ seed=seed, llambda=float(args.llambda))
246
+ concat = np.concatenate((np.array(syn_images[0]), th2image(batch["pixel_values"][0])), axis=1)
247
+ Image.fromarray(concat).save(os.path.join(save_dir, f'{str(step).zfill(5)}_{str(seed).zfill(5)}.jpg'))
inference_local.sh ADDED
@@ -0,0 +1,12 @@
1
+ export MODEL_NAME="CompVis/stable-diffusion-v1-4"
2
+ export DATA_DIR='./test_datasets/'
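+ # The two mapper checkpoints referenced below are expected under ./checkpoints/ before running inference.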
3
+ CUDA_VISIBLE_DEVICES=7 python inference_local.py \
4
+ --pretrained_model_name_or_path=$MODEL_NAME \
5
+ --test_data_dir=$DATA_DIR \
6
+ --output_dir="./outputs/local_mapping" \
7
+ --suffix="object" \
8
+ --template="a photo of a {}" \
9
+ --llambda="0.8" \
10
+ --global_mapper_path="./checkpoints/global_mapper.pt" \
11
+ --local_mapper_path="./checkpoints/local_mapper.pt"
12
+
test_datasets/1.jpg ADDED
test_datasets/10.jpg ADDED
test_datasets/10_bg.png ADDED
test_datasets/11.jpg ADDED
test_datasets/11_bg.png ADDED
test_datasets/15.jpg ADDED
test_datasets/15_bg.png ADDED
test_datasets/16.jpg ADDED
test_datasets/16_bg.png ADDED
test_datasets/17.jpg ADDED
test_datasets/17_bg.png ADDED
test_datasets/1_bg.png ADDED
test_datasets/2.jpg ADDED
test_datasets/20.jpg ADDED
test_datasets/20_bg.png ADDED
test_datasets/2_bg.png ADDED
test_datasets/3.jpg ADDED
test_datasets/3_bg.png ADDED
test_datasets/4.png ADDED
test_datasets/4_bg.png ADDED
test_datasets/7.jpg ADDED
test_datasets/7_bg.png ADDED
train_global.py ADDED
@@ -0,0 +1,715 @@
1
+ import argparse
2
+ import itertools
3
+ import math
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+ from torch.utils.data import Dataset
13
+
14
+ import PIL
15
+ from accelerate import Accelerator
16
+ from accelerate.logging import get_logger
17
+ from accelerate.utils import set_seed
18
+ from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel, LMSDiscreteScheduler
19
+ from diffusers.optimization import get_scheduler
20
+ from huggingface_hub import HfFolder, Repository, whoami
21
+
22
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
23
+ from transformers.utils import (
24
+ add_start_docstrings_to_model_forward,
25
+ replace_return_docstrings,
26
+ )
27
+ from transformers.models.clip.configuration_clip import CLIPTextConfig
28
+ from transformers.models.clip.modeling_clip import CLIP_TEXT_INPUTS_DOCSTRING, _expand_mask
29
+
30
+ from PIL import Image
31
+ from tqdm.auto import tqdm
32
+ from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModel
33
+
34
+ from typing import Any, Optional, Tuple, Union
35
+ from datasets import OpenImagesDataset
36
+
37
+
38
+
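+ # Global mapper: turns the CLS and patch features of five CLIP layers into five word embeddings (patch features are mean-pooled).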
39
+ class Mapper(nn.Module):
40
+ def __init__(self,
41
+ input_dim: int,
42
+ output_dim: int,
43
+ ):
44
+ super(Mapper, self).__init__()
45
+
46
+ for i in range(5):
47
+ setattr(self, f'mapping_{i}', nn.Sequential(nn.Linear(input_dim, 1024),
48
+ nn.LayerNorm(1024),
49
+ nn.LeakyReLU(),
50
+ nn.Linear(1024, 1024),
51
+ nn.LayerNorm(1024),
52
+ nn.LeakyReLU(),
53
+ nn.Linear(1024, output_dim)))
54
+
55
+ setattr(self, f'mapping_patch_{i}', nn.Sequential(nn.Linear(input_dim, 1024),
56
+ nn.LayerNorm(1024),
57
+ nn.LeakyReLU(),
58
+ nn.Linear(1024, 1024),
59
+ nn.LayerNorm(1024),
60
+ nn.LeakyReLU(),
61
+ nn.Linear(1024, output_dim)))
62
+
63
+ def forward(self, embs):
64
+ hidden_states = ()
65
+ for i, emb in enumerate(embs):
66
+ hidden_state = getattr(self, f'mapping_{i}')(emb[:, :1]) + getattr(self, f'mapping_patch_{i}')(emb[:, 1:]).mean(dim=1, keepdim=True)
67
+ hidden_states += (hidden_state, )
68
+ hidden_states = torch.cat(hidden_states, dim=1)
69
+ return hidden_states
70
+
71
+
72
+ def _build_causal_attention_mask(bsz, seq_len, dtype):
73
+ # lazily create causal attention mask, with full attention between the vision tokens
74
+ # pytorch uses additive attention mask; fill with -inf
75
+ mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
76
+ mask.fill_(torch.tensor(torch.finfo(dtype).min))
77
+ mask.triu_(1) # zero out the lower diagonal
78
+ mask = mask.unsqueeze(1) # expand mask
79
+ return mask
80
+
81
+
82
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
83
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
84
+ def inj_forward_text(
85
+ self,
86
+ input_ids: Optional[torch.Tensor] = None,
87
+ attention_mask: Optional[torch.Tensor] = None,
88
+ position_ids: Optional[torch.Tensor] = None,
89
+ output_attentions: Optional[bool] = None,
90
+ output_hidden_states: Optional[bool] = None,
91
+ return_dict: Optional[bool] = None,
92
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
93
+ r"""
94
+ Returns:
95
+ """
96
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
97
+ output_hidden_states = (
98
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
99
+ )
100
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
101
+
102
+ if input_ids is None:
103
+ raise ValueError("You have to specify either input_ids")
104
+
105
+ r_input_ids = input_ids['input_ids']
106
+ if 'inj_embedding' in input_ids:
107
+ inj_embedding = input_ids['inj_embedding']
108
+ inj_index = input_ids['inj_index']
109
+ else:
110
+ inj_embedding = None
111
+ inj_index = None
112
+
113
+ input_shape = r_input_ids.size()
114
+ r_input_ids = r_input_ids.view(-1, input_shape[-1])
115
+
116
+
117
+ inputs_embeds = self.embeddings.token_embedding(r_input_ids)
118
+ new_inputs_embeds = inputs_embeds.clone()
119
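+ # Insert the injected image embedding at the placeholder position, shifting the following token embeddings to make room.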
+ if inj_embedding is not None:
120
+ emb_length = inj_embedding.shape[1]
121
+ for bsz, idx in enumerate(inj_index):
122
+ lll = new_inputs_embeds[bsz, idx+emb_length:].shape[0]
123
+ new_inputs_embeds[bsz, idx+emb_length:] = inputs_embeds[bsz, idx+1:idx+1+lll]
124
+ new_inputs_embeds[bsz, idx:idx+emb_length] = inj_embedding[bsz]
125
+
126
+ hidden_states = self.embeddings(input_ids=r_input_ids, position_ids=position_ids, inputs_embeds=new_inputs_embeds)
127
+
128
+ bsz, seq_len = input_shape
129
+ # CLIP's text model uses causal mask, prepare it here.
130
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
131
+ causal_attention_mask = _build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
132
+ hidden_states.device
133
+ )
134
+ # expand attention_mask
135
+ if attention_mask is not None:
136
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
137
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
138
+
139
+ encoder_outputs = self.encoder(
140
+ inputs_embeds=hidden_states,
141
+ attention_mask=attention_mask,
142
+ causal_attention_mask=causal_attention_mask,
143
+ output_attentions=output_attentions,
144
+ output_hidden_states=output_hidden_states,
145
+ return_dict=return_dict,
146
+ )
147
+
148
+ last_hidden_state = encoder_outputs[0]
149
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
150
+
151
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
152
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
153
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
154
+ pooled_output = last_hidden_state[
155
+ torch.arange(last_hidden_state.shape[0], device=r_input_ids.device), r_input_ids.to(torch.int).argmax(dim=-1)
156
+ ]
157
+
158
+ if not return_dict:
159
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
160
+
161
+ return BaseModelOutputWithPooling(
162
+ last_hidden_state=last_hidden_state,
163
+ pooler_output=pooled_output,
164
+ hidden_states=encoder_outputs.hidden_states,
165
+ attentions=encoder_outputs.attentions,
166
+ )
167
+
168
+ def inj_forward_crossattention(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
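+ # For the injected text context, keys and values come from the trainable to_k_global/to_v_global projections instead of the frozen to_k/to_v.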
169
+ context = encoder_hidden_states
170
+ if context is not None:
171
+ context_tensor = context["CONTEXT_TENSOR"]
172
+ else:
173
+ context_tensor = hidden_states
174
+
175
+ batch_size, sequence_length, _ = hidden_states.shape
176
+
177
+ query = self.to_q(hidden_states)
178
+ if context is not None:
179
+ key = self.to_k_global(context_tensor)
180
+ value = self.to_v_global(context_tensor)
181
+ else:
182
+ key = self.to_k(context_tensor)
183
+ value = self.to_v(context_tensor)
184
+
185
+ dim = query.shape[-1]
186
+
187
+ query = self.reshape_heads_to_batch_dim(query)
188
+ key = self.reshape_heads_to_batch_dim(key)
189
+ value = self.reshape_heads_to_batch_dim(value)
190
+
191
+ attention_scores = torch.matmul(query, key.transpose(-1, -2))
192
+ attention_scores = attention_scores * self.scale
193
+
194
+ attention_probs = attention_scores.softmax(dim=-1)
195
+
196
+ hidden_states = torch.matmul(attention_probs, value)
197
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
198
+
199
+ # linear proj
200
+ hidden_states = self.to_out[0](hidden_states)
201
+ # dropout
202
+ hidden_states = self.to_out[1](hidden_states)
203
+
204
+ return hidden_states
205
+
206
+
207
+
208
+ logger = get_logger(__name__)
209
+
210
+
211
+ def save_progress(mapper, accelerator, args, step=None):
212
+ logger.info("Saving embeddings")
213
+
214
+ state_dict = accelerator.unwrap_model(mapper).state_dict()
215
+
216
+ if step is not None:
217
+ torch.save(state_dict, os.path.join(args.output_dir, f"mapper_{str(step).zfill(6)}.pt"))
218
+ else:
219
+ torch.save(state_dict, os.path.join(args.output_dir, "mapper.pt"))
220
+
221
+
222
+ def parse_args():
223
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
224
+ parser.add_argument(
225
+ "--save_steps",
226
+ type=int,
227
+ default=500,
228
+ help="Save learned_embeds.bin every X updates steps.",
229
+ )
230
+ parser.add_argument(
231
+ "--pretrained_model_name_or_path",
232
+ type=str,
233
+ default=None,
234
+ required=True,
235
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
236
+ )
237
+ parser.add_argument(
238
+ "--tokenizer_name",
239
+ type=str,
240
+ default=None,
241
+ help="Pretrained tokenizer name or path if not the same as model_name",
242
+ )
243
+ parser.add_argument(
244
+ "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
245
+ )
246
+ parser.add_argument(
247
+ "--global_mapper_path", type=str, default=None, help="If not none, the training will start from the given checkpoints."
248
+ )
249
+ parser.add_argument(
250
+ "--placeholder_token",
251
+ type=str,
252
+ default=None,
253
+ required=True,
254
+ help="A token to use as a placeholder for the concept.",
255
+ )
256
+ parser.add_argument(
257
+ "--output_dir",
258
+ type=str,
259
+ default="text-inversion-model",
260
+ help="The output directory where the model predictions and checkpoints will be written.",
261
+ )
262
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
263
+ parser.add_argument(
264
+ "--resolution",
265
+ type=int,
266
+ default=512,
267
+ help=(
268
+ "The resolution for input images, all the images in the train/validation dataset will be resized to this"
269
+ " resolution"
270
+ ),
271
+ )
272
+ parser.add_argument(
273
+ "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
274
+ )
275
+ parser.add_argument("--num_train_epochs", type=int, default=100)
276
+ parser.add_argument(
277
+ "--max_train_steps",
278
+ type=int,
279
+ default=5000,
280
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
281
+ )
282
+ parser.add_argument(
283
+ "--gradient_accumulation_steps",
284
+ type=int,
285
+ default=1,
286
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
287
+ )
288
+ parser.add_argument(
289
+ "--learning_rate",
290
+ type=float,
291
+ default=1e-4,
292
+ help="Initial learning rate (after the potential warmup period) to use.",
293
+ )
294
+ parser.add_argument(
295
+ "--scale_lr",
296
+ action="store_true",
297
+ default=True,
298
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
299
+ )
300
+ parser.add_argument(
301
+ "--lr_scheduler",
302
+ type=str,
303
+ default="constant",
304
+ help=(
305
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
306
+ ' "constant", "constant_with_warmup"]'
307
+ ),
308
+ )
309
+ parser.add_argument(
310
+ "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
311
+ )
312
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
313
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
314
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
315
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
316
+ parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
317
+ parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
318
+ parser.add_argument(
319
+ "--hub_model_id",
320
+ type=str,
321
+ default=None,
322
+ help="The name of the repository to keep in sync with the local `output_dir`.",
323
+ )
324
+ parser.add_argument(
325
+ "--logging_dir",
326
+ type=str,
327
+ default="logs",
328
+ help=(
329
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
330
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
331
+ ),
332
+ )
333
+ parser.add_argument(
334
+ "--mixed_precision",
335
+ type=str,
336
+ default="no",
337
+ choices=["no", "fp16", "bf16"],
338
+ help=(
339
+ "Whether to use mixed precision. Choose"
340
+ "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
341
+ "and an Nvidia Ampere GPU."
342
+ ),
343
+ )
344
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
345
+
346
+ args = parser.parse_args()
347
+ env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
348
+ if env_local_rank != -1 and env_local_rank != args.local_rank:
349
+ args.local_rank = env_local_rank
350
+
351
+ if args.train_data_dir is None:
352
+ raise ValueError("You must specify a train data directory.")
353
+
354
+ return args
355
+
356
+ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
357
+ if token is None:
358
+ token = HfFolder.get_token()
359
+ if organization is None:
360
+ username = whoami(token)["name"]
361
+ return f"{username}/{model_id}"
362
+ else:
363
+ return f"{organization}/{model_id}"
364
+
365
+
366
+ def freeze_params(params):
367
+ for param in params:
368
+ param.requires_grad = False
369
+
370
+ def unfreeze_params(params):
371
+ for param in params:
372
+ param.requires_grad = True
373
+
374
+ def th2image(image):
375
+ image = (image / 2 + 0.5).clamp(0, 1)
376
+ image = image.detach().cpu().permute(1, 2, 0).numpy()
377
+ image = (image * 255).round().astype("uint8")
378
+ return Image.fromarray(image)
379
+
380
+
381
+ @torch.no_grad()
382
+ def validation(example, tokenizer, image_encoder, text_encoder, unet, mapper, vae, device, guidance_scale, token_index='full', seed=None):
383
+ scheduler = LMSDiscreteScheduler(
384
+ beta_start=0.00085,
385
+ beta_end=0.012,
386
+ beta_schedule="scaled_linear",
387
+ num_train_timesteps=1000,
388
+ )
389
+
390
+ uncond_input = tokenizer(
391
+ [''] * example["pixel_values"].shape[0],
392
+ padding="max_length",
393
+ max_length=tokenizer.model_max_length,
394
+ return_tensors="pt",
395
+ )
396
+ uncond_embeddings = text_encoder({'input_ids':uncond_input.input_ids.to(device)})[0]
397
+
398
+ if seed is None:
399
+ latents = torch.randn(
400
+ (example["pixel_values"].shape[0], unet.in_channels, 64, 64)
401
+ )
402
+ else:
403
+ generator = torch.manual_seed(seed)
404
+ latents = torch.randn(
405
+ (example["pixel_values"].shape[0], unet.in_channels, 64, 64), generator=generator,
406
+ )
407
+
408
+ latents = latents.to(example["pixel_values_clip"])
409
+ scheduler.set_timesteps(100)
410
+ latents = latents * scheduler.init_noise_sigma
411
+
412
+ placeholder_idx = example["index"]
413
+ image = F.interpolate(example["pixel_values_clip"], (224, 224), mode='bilinear')
414
+
415
+ image_features = image_encoder(image, output_hidden_states=True)
416
+ image_embeddings = [image_features[0], image_features[2][4], image_features[2][8], image_features[2][12],
417
+ image_features[2][16]]
418
+ image_embeddings = [emb.detach() for emb in image_embeddings]
419
+ inj_embedding = mapper(image_embeddings)
420
+
421
+ if token_index != 'full':
422
+ token_index = int(token_index)
423
+ inj_embedding = inj_embedding[:, token_index:token_index + 1, :]
424
+
425
+ encoder_hidden_states = text_encoder({'input_ids': example["input_ids"],
426
+ "inj_embedding": inj_embedding,
427
+ "inj_index": placeholder_idx})[0]
428
+
429
+ for t in tqdm(scheduler.timesteps):
430
+ latent_model_input = scheduler.scale_model_input(latents, t)
431
+ noise_pred_text = unet(
432
+ latent_model_input,
433
+ t,
434
+ encoder_hidden_states={
435
+ "CONTEXT_TENSOR": encoder_hidden_states,
436
+ }
437
+ ).sample
438
+
439
+ latent_model_input = scheduler.scale_model_input(latents, t)
440
+
441
+ noise_pred_uncond = unet(
442
+ latent_model_input,
443
+ t,
444
+ encoder_hidden_states={
445
+ "CONTEXT_TENSOR": uncond_embeddings,
446
+ }
447
+ ).sample
448
+
449
+ noise_pred = noise_pred_uncond + guidance_scale * (
450
+ noise_pred_text - noise_pred_uncond
451
+ )
452
+
453
+ # compute the previous noisy sample x_t -> x_t-1
454
+ latents = scheduler.step(noise_pred, t, latents).prev_sample
455
+
456
+ _latents = 1 / 0.18215 * latents.clone()
457
+ images = vae.decode(_latents).sample
458
+ ret_pil_images = [th2image(image) for image in images]
459
+
460
+ return ret_pil_images
461
+
462
+ def main():
463
+ args = parse_args()
464
+ logging_dir = os.path.join(args.output_dir, args.logging_dir)
465
+
466
+ accelerator = Accelerator(
467
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
468
+ mixed_precision=args.mixed_precision,
469
+ log_with="tensorboard",
470
+ logging_dir=logging_dir,
471
+ )
472
+
473
+ # If passed along, set the training seed now.
474
+ if args.seed is not None:
475
+ set_seed(args.seed)
476
+
477
+ # Handle the repository creation
478
+ if accelerator.is_main_process:
479
+ if args.push_to_hub:
480
+ if args.hub_model_id is None:
481
+ repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
482
+ else:
483
+ repo_name = args.hub_model_id
484
+ repo = Repository(args.output_dir, clone_from=repo_name)
485
+
486
+ with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
487
+ if "step_*" not in gitignore:
488
+ gitignore.write("step_*\n")
489
+ if "epoch_*" not in gitignore:
490
+ gitignore.write("epoch_*\n")
491
+ elif args.output_dir is not None:
492
+ os.makedirs(args.output_dir, exist_ok=True)
493
+
494
+ # Load the tokenizer and add the placeholder token as an additional special token
495
+ if args.tokenizer_name:
496
+ tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
497
+ elif args.pretrained_model_name_or_path:
498
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
499
+
500
+ # Load models and create wrapper for stable diffusion
501
+ text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
502
+
503
+ # replace the forward method of the text encoder to inject the word embedding
504
+ for _module in text_encoder.modules():
505
+ if _module.__class__.__name__ == "CLIPTextTransformer":
506
+ _module.__class__.__call__ = inj_forward_text
507
+
508
+ image_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
509
+
510
+ mapper = Mapper(input_dim=1024, output_dim=768)
511
+
512
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
513
+ unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
514
+
515
+ # replace the forward method of the crossattention to finetune the to_k and to_v layers
516
+ for _name, _module in unet.named_modules():
517
+ if _module.__class__.__name__ == "CrossAttention":
518
+ if 'attn1' in _name: continue
519
+ _module.__class__.__call__ = inj_forward_crossattention
520
+
521
+ shape = _module.to_k.weight.shape
522
+ to_k_global = nn.Linear(shape[1], shape[0], bias=False)
523
+ to_k_global.weight.data = _module.to_k.weight.data.clone()
524
+ mapper.add_module(f'{_name.replace(".", "_")}_to_k', to_k_global)
525
+
526
+ shape = _module.to_v.weight.shape
527
+ to_v_global = nn.Linear(shape[1], shape[0], bias=False)
528
+ to_v_global.weight.data = _module.to_v.weight.data.clone()
529
+ mapper.add_module(f'{_name.replace(".", "_")}_to_v', to_v_global)
530
+
531
+ if args.global_mapper_path is None:
532
+ _module.add_module('to_k_global', to_k_global)
533
+ _module.add_module('to_v_global', to_v_global)
534
+
535
+ if args.global_mapper_path is not None:
536
+ mapper.load_state_dict(torch.load(args.global_mapper_path, map_location='cpu'))
537
+ for _name, _module in unet.named_modules():
538
+ if _module.__class__.__name__ == "CrossAttention":
539
+ if 'attn1' in _name: continue
540
+ _module.add_module('to_k_global', getattr(mapper, f'{_name.replace(".", "_")}_to_k'))
541
+ _module.add_module('to_v_global', getattr(mapper, f'{_name.replace(".", "_")}_to_v'))
542
+
543
+ # Freeze vae, unet, and the text and image encoders
544
+ freeze_params(vae.parameters())
545
+ freeze_params(unet.parameters())
546
+ freeze_params(text_encoder.parameters())
547
+ freeze_params(image_encoder.parameters())
548
+
549
+ # Unfreeze the mapper
550
+ unfreeze_params(mapper.parameters())
551
+
552
+ if args.scale_lr:
553
+ args.learning_rate = (
554
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
555
+ )
556
+
557
+ # Initialize the optimizer
558
+ optimizer = torch.optim.AdamW(
559
+ itertools.chain(mapper.parameters()), # only optimize the mapper parameters
560
+ lr=args.learning_rate,
561
+ betas=(args.adam_beta1, args.adam_beta2),
562
+ weight_decay=args.adam_weight_decay,
563
+ eps=args.adam_epsilon,
564
+ )
565
+
566
+ noise_scheduler = DDPMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler")
567
+
568
+ train_dataset = OpenImagesDataset(
569
+ data_root=args.train_data_dir,
570
+ tokenizer=tokenizer,
571
+ size=args.resolution,
572
+ placeholder_token=args.placeholder_token,
573
+ set="test",
574
+ )
575
+ train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)
576
+
577
+ # Scheduler and math around the number of training steps.
578
+ overrode_max_train_steps = False
579
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
580
+ if args.max_train_steps is None:
581
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
582
+ overrode_max_train_steps = True
583
+
584
+ lr_scheduler = get_scheduler(
585
+ args.lr_scheduler,
586
+ optimizer=optimizer,
587
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
588
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
589
+ )
590
+
591
+ mapper, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
592
+ mapper, optimizer, train_dataloader, lr_scheduler
593
+ )
594
+
595
+ # Move vae, unet, and encoders to device
596
+ vae.to(accelerator.device)
597
+ unet.to(accelerator.device)
598
+ image_encoder.to(accelerator.device)
599
+ text_encoder.to(accelerator.device)
600
+ # Keep vae, unet and image_encoder in eval mode as we don't train these
601
+ vae.eval()
602
+ unet.eval()
603
+ image_encoder.eval()
604
+
605
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
606
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
607
+ if overrode_max_train_steps:
608
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
609
+ # Afterwards we recalculate our number of training epochs
610
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
611
+
612
+ # We need to initialize the trackers we use, and also store our configuration.
613
+ # The trackers initialize automatically on the main process.
614
+ if accelerator.is_main_process:
615
+ accelerator.init_trackers("elite", config=vars(args))
616
+
617
+ # Train!
618
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
619
+
620
+ logger.info("***** Running training *****")
621
+ logger.info(f" Num examples = {len(train_dataset)}")
622
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
623
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
624
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
625
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
626
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
627
+ # Only show the progress bar once on each machine.
628
+ progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
629
+ progress_bar.set_description("Steps")
630
+ global_step = 0
631
+
632
+ for epoch in range(args.num_train_epochs):
633
+ mapper.train()
634
+ for step, batch in enumerate(train_dataloader):
635
+ with accelerator.accumulate(mapper):
636
+ # Convert images to latent space
637
+ latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach()
638
+ latents = latents * 0.18215
639
+
640
+ # Sample noise that we'll add to the latents
641
+ noise = torch.randn(latents.shape).to(latents.device)
642
+ bsz = latents.shape[0]
643
+ # Sample a random timestep for each image
644
+ timesteps = torch.randint(
645
+ 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
646
+ ).long()
647
+
648
+ # Add noise to the latents according to the noise magnitude at each timestep
649
+ # (this is the forward diffusion process)
650
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
651
+
652
+ placeholder_idx = batch["index"]
653
+ image = F.interpolate(batch["pixel_values_clip"], (224, 224), mode='bilinear')
654
+
655
+ image_features = image_encoder(image, output_hidden_states=True)
656
+ image_embeddings = [image_features[0], image_features[2][4], image_features[2][8], image_features[2][12], image_features[2][16]]
657
+ image_embeddings = [emb.detach() for emb in image_embeddings]
658
+ inj_embedding = mapper(image_embeddings)
659
+
660
+ # Get the text embedding for conditioning
661
+ encoder_hidden_states = text_encoder({'input_ids': batch["input_ids"],
662
+ "inj_embedding": inj_embedding,
663
+ "inj_index": placeholder_idx.detach()})[0]
664
+
665
+ noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states={
666
+ "CONTEXT_TENSOR": encoder_hidden_states,
667
+ }).sample
668
+
669
+ loss_mle = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
670
+
671
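+ # Small L1 penalty on the injected embeddings to keep their magnitude bounded.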
+ loss_reg = torch.mean(torch.abs(inj_embedding)) * 0.01
672
+
673
+ loss = loss_mle + loss_reg
674
+
675
+ accelerator.backward(loss)
676
+
677
+ if accelerator.sync_gradients:
678
+ accelerator.clip_grad_norm_(mapper.parameters(), 1)
679
+
680
+ optimizer.step()
681
+ lr_scheduler.step()
682
+ optimizer.zero_grad()
683
+
684
+
685
+ # Checks if the accelerator has performed an optimization step behind the scenes
686
+ if accelerator.sync_gradients:
687
+ progress_bar.update(1)
688
+ global_step += 1
689
+ if global_step % args.save_steps == 0:
690
+ save_progress(mapper, accelerator, args, global_step)
691
+ syn_images = validation(batch, tokenizer, image_encoder, text_encoder, unet, mapper, vae, batch["pixel_values_clip"].device, 5)
692
+ gt_images = [th2image(img) for img in batch["pixel_values"]]
693
+ img_list = []
694
+ for syn, gt in zip(syn_images, gt_images):
695
+ img_list.append(np.concatenate((np.array(syn), np.array(gt)), axis=1))
696
+ img_list = np.concatenate(img_list, axis=0)
697
+ Image.fromarray(img_list).save(os.path.join(args.output_dir, f"{str(global_step).zfill(5)}.jpg"))
698
+
699
+ logs = {"loss_mle": loss_mle.detach().item(), "loss_reg": loss_reg.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
700
+ progress_bar.set_postfix(**logs)
701
+ accelerator.log(logs, step=global_step)
702
+
703
+ if global_step >= args.max_train_steps:
704
+ break
705
+
706
+ accelerator.wait_for_everyone()
707
+
708
+ if accelerator.is_main_process:
709
+ save_progress(mapper, accelerator, args)
710
+
711
+ accelerator.end_training()
712
+
713
+
714
+ if __name__ == "__main__":
715
+ main()
train_global.sh ADDED
@@ -0,0 +1,15 @@
1
+ export MODEL_NAME="CompVis/stable-diffusion-v1-4"
2
+ export DATA_DIR='/home/weiyuxiang/datasets/Open_Images/'
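+ # 4 GPUs x per-device batch 4 x gradient accumulation 4 = effective batch size 64.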
3
+ CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch --config_file 4_gpu.json --main_process_port 25656 train_global.py \
4
+ --pretrained_model_name_or_path=$MODEL_NAME \
5
+ --train_data_dir=$DATA_DIR \
6
+ --placeholder_token="S" \
7
+ --resolution=512 \
8
+ --train_batch_size=4 \
9
+ --gradient_accumulation_steps=4 \
10
+ --max_train_steps=200000 \
11
+ --learning_rate=1e-06 --scale_lr \
12
+ --lr_scheduler="constant" \
13
+ --lr_warmup_steps=0 \
14
+ --output_dir="./elite_experiments/global_mapping" \
15
+ --save_steps 200
train_local.py ADDED
@@ -0,0 +1,709 @@
1
+
2
+ import argparse
3
+ import itertools
4
+ import math
5
+ import os
6
+ from pathlib import Path
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torch.utils.checkpoint
13
+ from torch.utils.data import Dataset
14
+
15
+ import PIL
16
+ from accelerate import Accelerator
17
+ from accelerate.logging import get_logger
18
+ from accelerate.utils import set_seed
19
+ from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel, LMSDiscreteScheduler
20
+ from diffusers.optimization import get_scheduler
21
+ from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
22
+ from huggingface_hub import HfFolder, Repository, whoami
23
+
24
+ # TODO: remove and import from diffusers.utils when the new version of diffusers is released
25
+ from PIL import Image
26
+ from tqdm.auto import tqdm
27
+ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModel
28
+
29
+
30
+ from typing import Optional
31
+ from train_global import inj_forward_text, th2image, Mapper
32
+ from datasets import OpenImagesDatasetWithMask
33
+
34
+
35
+ class MapperLocal(nn.Module):
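+ # Local mapper: like Mapper, but keeps one embedding per CLIP patch (adding the mapped CLS feature to each) and averages the five feature levels.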
36
+ def __init__(self,
37
+ input_dim: int,
38
+ output_dim: int,
39
+ ):
40
+ super(MapperLocal, self).__init__()
41
+
42
+ for i in range(5):
43
+ setattr(self, f'mapping_{i}', nn.Sequential(nn.Linear(input_dim, 1024),
44
+ nn.LayerNorm(1024),
45
+ nn.LeakyReLU(),
46
+ nn.Linear(1024, 1024),
47
+ nn.LayerNorm(1024),
48
+ nn.LeakyReLU(),
49
+ nn.Linear(1024, output_dim)))
50
+
51
+ setattr(self, f'mapping_patch_{i}', nn.Sequential(nn.Linear(input_dim, 1024),
52
+ nn.LayerNorm(1024),
53
+ nn.LeakyReLU(),
54
+ nn.Linear(1024, 1024),
55
+ nn.LayerNorm(1024),
56
+ nn.LeakyReLU(),
57
+ nn.Linear(1024, output_dim)))
58
+
59
+ def forward(self, embs):
60
+ hidden_states = ()
61
+ for i, emb in enumerate(embs):
62
+ hidden_state = getattr(self, f'mapping_{i}')(emb[:, :1]) + getattr(self, f'mapping_patch_{i}')(emb[:, 1:])
63
+ hidden_states += (hidden_state.unsqueeze(0),)
64
+ hidden_states = torch.cat(hidden_states, dim=0).mean(dim=0)
65
+ return hidden_states
66
+
67
+ value_local_list = []
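+ # Collects the local value projections of each cross-attention layer; used for the regularization loss in train_local and cleared after every step.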
68
+
69
+ def inj_forward_crossattention(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
70
+
71
+ context = encoder_hidden_states
72
+ hidden_states_local = hidden_states.clone()
73
+ if context is not None:
74
+ context_tensor = context["CONTEXT_TENSOR"]
75
+ else:
76
+ context_tensor = hidden_states
77
+
78
+ batch_size, sequence_length, _ = hidden_states.shape
79
+
80
+ query = self.to_q(hidden_states)
81
+
82
+ if context is not None:
83
+ key = self.to_k_global(context_tensor)
84
+ value = self.to_v_global(context_tensor)
85
+ else:
86
+ key = self.to_k(context_tensor)
87
+ value = self.to_v(context_tensor)
88
+
89
+ dim = query.shape[-1]
90
+
91
+ query = self.reshape_heads_to_batch_dim(query)
92
+ key = self.reshape_heads_to_batch_dim(key)
93
+ value = self.reshape_heads_to_batch_dim(value)
94
+
95
+
96
+ attention_scores = torch.matmul(query, key.transpose(-1, -2))
97
+ attention_scores = attention_scores * self.scale
98
+
99
+ attention_probs = attention_scores.softmax(dim=-1)
100
+
101
+ hidden_states = torch.matmul(attention_probs, value)
102
+
103
+ if context is not None and "LOCAL" in context:
104
+ # Perform cross attention with the local context
105
+ query_local = self.to_q(hidden_states_local)
106
+ key_local = self.to_k_local(context["LOCAL"])
107
+ value_local = self.to_v_local(context["LOCAL"])
108
+
109
+ query_local = self.reshape_heads_to_batch_dim(query_local)
110
+ key_local = self.reshape_heads_to_batch_dim(key_local)
111
+ value_local = self.reshape_heads_to_batch_dim(value_local)
112
+
113
+ attention_scores_local = torch.matmul(query_local, key_local.transpose(-1, -2))
114
+ attention_scores_local = attention_scores_local * self.scale
115
+ attention_probs_local = attention_scores_local.softmax(dim=-1)
116
+
117
+ # Extract the attention map of the learned placeholder token [w]
118
+ index_local = context["LOCAL_INDEX"]
119
+ index_local = index_local.reshape(index_local.shape[0], 1).repeat((1, self.heads)).reshape(-1)
120
+ attention_probs_clone = attention_probs.clone().permute((0, 2, 1))
121
+ attention_probs_mask = attention_probs_clone[torch.arange(index_local.shape[0]), index_local]
122
+ # Normalize the attention map
123
+ attention_probs_mask = attention_probs_mask.unsqueeze(2) / attention_probs_mask.max()
124
+
125
+ if "LAMBDA" in context:
126
+ _lambda = context["LAMBDA"]
127
+ else:
128
+ _lambda = 1
129
+
130
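+ # Gate the local attention with the placeholder token's normalized global attention map, scaled by lambda.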
+ attention_probs_local = attention_probs_local * attention_probs_mask * _lambda
131
+ hidden_states += torch.matmul(attention_probs_local, value_local)
132
+ value_local_list.append(value_local)
133
+
134
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
135
+
136
+ # linear proj
137
+ hidden_states = self.to_out[0](hidden_states)
138
+ # dropout
139
+ hidden_states = self.to_out[1](hidden_states)
140
+
141
+ return hidden_states
142
+
143
+ # ------------------------------------------------------------------------------
144
+
145
+ logger = get_logger(__name__)
146
+
147
+
148
+ def save_progress(mapper, accelerator, args, step=None):
149
+ logger.info("Saving embeddings")
150
+
151
+ state_dict = accelerator.unwrap_model(mapper).state_dict()
152
+
153
+ if step is not None:
154
+ torch.save(state_dict, os.path.join(args.output_dir, f"local_mapper_{str(step).zfill(6)}.pt"))
155
+ else:
156
+ torch.save(state_dict, os.path.join(args.output_dir, "local_mapper.pt"))
157
+
158
+
159
+ def parse_args():
160
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
161
+ parser.add_argument(
162
+ "--save_steps",
163
+ type=int,
164
+ default=500,
165
+ help="Save learned_embeds.bin every X updates steps.",
166
+ )
167
+ parser.add_argument(
168
+ "--pretrained_model_name_or_path",
169
+ type=str,
170
+ default=None,
171
+ required=True,
172
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
173
+ )
174
+ parser.add_argument(
175
+ "--tokenizer_name",
176
+ type=str,
177
+ default=None,
178
+ help="Pretrained tokenizer name or path if not the same as model_name",
179
+ )
180
+ parser.add_argument(
181
+ "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
182
+ )
183
+ parser.add_argument(
184
+ "--global_mapper_path", type=str, default=None,
185
+ help="If not none, the training will start from the given checkpoints."
186
+ )
187
+ parser.add_argument(
188
+ "--local_mapper_path", type=str, default=None,
189
+ help="If not none, the training will start from the given checkpoints."
190
+ )
191
+ parser.add_argument(
192
+ "--placeholder_token",
193
+ type=str,
194
+ default=None,
195
+ required=True,
196
+ help="A token to use as a placeholder for the concept.",
197
+ )
198
+ parser.add_argument(
199
+ "--output_dir",
200
+ type=str,
201
+ default="text-inversion-model",
202
+ help="The output directory where the model predictions and checkpoints will be written.",
203
+ )
204
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
205
+ parser.add_argument(
206
+ "--resolution",
207
+ type=int,
208
+ default=512,
209
+ help=(
210
+ "The resolution for input images, all the images in the train/validation dataset will be resized to this"
211
+ " resolution"
212
+ ),
213
+ )
214
+ parser.add_argument(
215
+ "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
216
+ )
217
+ parser.add_argument("--num_train_epochs", type=int, default=100)
218
+ parser.add_argument(
219
+ "--max_train_steps",
220
+ type=int,
221
+ default=5000,
222
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
223
+ )
224
+ parser.add_argument(
225
+ "--gradient_accumulation_steps",
226
+ type=int,
227
+ default=1,
228
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
229
+ )
230
+ parser.add_argument(
231
+ "--learning_rate",
232
+ type=float,
233
+ default=1e-4,
234
+ help="Initial learning rate (after the potential warmup period) to use.",
235
+ )
236
+ parser.add_argument(
237
+ "--scale_lr",
238
+ action="store_true",
239
+ default=True,
240
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
241
+ )
242
+ parser.add_argument(
243
+ "--lr_scheduler",
244
+ type=str,
245
+ default="constant",
246
+ help=(
247
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
248
+ ' "constant", "constant_with_warmup"]'
249
+ ),
250
+ )
251
+ parser.add_argument(
252
+ "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
253
+ )
254
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
255
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
256
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
257
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
258
+ parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
259
+ parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
260
+ parser.add_argument(
261
+ "--hub_model_id",
262
+ type=str,
263
+ default=None,
264
+ help="The name of the repository to keep in sync with the local `output_dir`.",
265
+ )
266
+ parser.add_argument(
267
+ "--logging_dir",
268
+ type=str,
269
+ default="logs",
270
+ help=(
271
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
272
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
273
+ ),
274
+ )
275
+ parser.add_argument(
276
+ "--mixed_precision",
277
+ type=str,
278
+ default="no",
279
+ choices=["no", "fp16", "bf16"],
280
+ help=(
281
+ "Whether to use mixed precision. Choose"
282
+ "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
283
+ "and an Nvidia Ampere GPU."
284
+ ),
285
+ )
286
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
287
+
288
+ args = parser.parse_args()
289
+ env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
290
+ if env_local_rank != -1 and env_local_rank != args.local_rank:
291
+ args.local_rank = env_local_rank
292
+
293
+ if args.train_data_dir is None:
294
+ raise ValueError("You must specify a train data directory.")
295
+
296
+ return args
297
+
298
+ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
299
+ if token is None:
300
+ token = HfFolder.get_token()
301
+ if organization is None:
302
+ username = whoami(token)["name"]
303
+ return f"{username}/{model_id}"
304
+ else:
305
+ return f"{organization}/{model_id}"
306
+
307
+
308
+ def freeze_params(params):
309
+ for param in params:
310
+ param.requires_grad = False
311
+
312
+ def unfreeze_params(params):
313
+ for param in params:
314
+ param.requires_grad = True
315
+
316
+
317
+ @torch.no_grad()
318
+ def validation(example, tokenizer, image_encoder, text_encoder, unet, mapper, mapper_local, vae, device, guidance_scale, seed=None, llambda=1):
319
+ scheduler = LMSDiscreteScheduler(
320
+ beta_start=0.00085,
321
+ beta_end=0.012,
322
+ beta_schedule="scaled_linear",
323
+ num_train_timesteps=1000,
324
+ )
325
+
326
+ uncond_input = tokenizer(
327
+ [''] * example["pixel_values"].shape[0],
328
+ padding="max_length",
329
+ max_length=tokenizer.model_max_length,
330
+ return_tensors="pt",
331
+ )
332
+ uncond_embeddings = text_encoder({'input_ids':uncond_input.input_ids.to(device)})[0]
333
+
334
+ if seed is None:
335
+ latents = torch.randn(
336
+ (example["pixel_values"].shape[0], unet.in_channels, 64, 64)
337
+ )
338
+ else:
339
+ generator = torch.manual_seed(seed)
340
+ latents = torch.randn(
341
+ (example["pixel_values"].shape[0], unet.in_channels, 64, 64), generator=generator,
342
+ )
343
+
344
+ latents = latents.to(example["pixel_values_clip"])
345
+ scheduler.set_timesteps(100)
346
+ latents = latents * scheduler.init_noise_sigma
347
+
348
+ placeholder_idx = example["index"]
349
+
350
+ image = F.interpolate(example["pixel_values_clip"], (224, 224), mode='bilinear')
351
+ image_features = image_encoder(image, output_hidden_states=True)
352
+ image_embeddings = [image_features[0], image_features[2][4], image_features[2][8], image_features[2][12], image_features[2][16]]
353
+ image_embeddings = [emb.detach() for emb in image_embeddings]
354
+ inj_embedding = mapper(image_embeddings)
355
+
356
+ inj_embedding = inj_embedding[:, 0:1, :]
357
+ encoder_hidden_states = text_encoder({'input_ids': example["input_ids"],
358
+ "inj_embedding": inj_embedding,
359
+ "inj_index": placeholder_idx})[0]
360
+
361
+ image_obj = F.interpolate(example["pixel_values_obj"], (224, 224), mode='bilinear')
362
+ image_features_obj = image_encoder(image_obj, output_hidden_states=True)
363
+ image_embeddings_obj = [image_features_obj[0], image_features_obj[2][4], image_features_obj[2][8],
364
+ image_features_obj[2][12], image_features_obj[2][16]]
365
+ image_embeddings_obj = [emb.detach() for emb in image_embeddings_obj]
366
+
367
+ inj_embedding_local = mapper_local(image_embeddings_obj)
368
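+ # Downsample the segmentation mask to the 16x16 CLIP patch grid so it aligns with the local embeddings.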
+ mask = F.interpolate(example["pixel_values_seg"], (16, 16), mode='nearest')
369
+ mask = mask[:, 0].reshape(mask.shape[0], -1, 1)
370
+ inj_embedding_local = inj_embedding_local * mask
371
+
372
+
373
+ for t in tqdm(scheduler.timesteps):
374
+ latent_model_input = scheduler.scale_model_input(latents, t)
375
+ noise_pred_text = unet(
376
+ latent_model_input,
377
+ t,
378
+ encoder_hidden_states={
379
+ "CONTEXT_TENSOR": encoder_hidden_states,
380
+ "LOCAL": inj_embedding_local,
381
+ "LOCAL_INDEX": placeholder_idx.detach(),
382
+ "LAMBDA": llambda
383
+ }
384
+ ).sample
385
+ value_local_list.clear()
386
+ latent_model_input = scheduler.scale_model_input(latents, t)
387
+
388
+ noise_pred_uncond = unet(
389
+ latent_model_input,
390
+ t,
391
+ encoder_hidden_states={
392
+ "CONTEXT_TENSOR": uncond_embeddings,
393
+ }
394
+ ).sample
395
+ value_local_list.clear()
396
+ noise_pred = noise_pred_uncond + guidance_scale * (
397
+ noise_pred_text - noise_pred_uncond
398
+ )
399
+
400
+ # compute the previous noisy sample x_t -> x_t-1
401
+ latents = scheduler.step(noise_pred, t, latents).prev_sample
402
+
403
+ _latents = 1 / 0.18215 * latents.clone()
404
+ images = vae.decode(_latents).sample
405
+ ret_pil_images = [th2image(image) for image in images]
406
+
407
+ return ret_pil_images
408
+
409
+ def main():
410
+ args = parse_args()
411
+ logging_dir = os.path.join(args.output_dir, args.logging_dir)
412
+
413
+ accelerator = Accelerator(
414
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
415
+ mixed_precision=args.mixed_precision,
416
+ log_with="tensorboard",
417
+ logging_dir=logging_dir,
418
+ )
419
+
420
+ # If passed along, set the training seed now.
421
+ if args.seed is not None:
422
+ set_seed(args.seed)
423
+
424
+ # Handle the repository creation
425
+ if accelerator.is_main_process:
426
+ if args.push_to_hub:
427
+ if args.hub_model_id is None:
428
+ repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
429
+ else:
430
+ repo_name = args.hub_model_id
431
+ repo = Repository(args.output_dir, clone_from=repo_name)
432
+
433
+ with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
434
+ if "step_*" not in gitignore:
435
+ gitignore.write("step_*\n")
436
+ if "epoch_*" not in gitignore:
437
+ gitignore.write("epoch_*\n")
438
+ elif args.output_dir is not None:
439
+ os.makedirs(args.output_dir, exist_ok=True)
440
+
441
+ # Load the tokenizer and add the placeholder token as an additional special token
442
+
443
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
444
+ # Load models and create wrapper for stable diffusion
445
+ text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
446
+
447
+ for _module in text_encoder.modules():
448
+ if _module.__class__.__name__ == "CLIPTextTransformer":
449
+ _module.__class__.__call__ = inj_forward_text
450
+
451
+ image_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
452
+
453
+ mapper = Mapper(input_dim=1024, output_dim=768)
454
+ mapper_local = MapperLocal(input_dim=1024, output_dim=768)
455
+
456
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
457
+ unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
458
+
459
+ # replace the forward method of the crossattention to finetune the to_k and to_v layers
460
+ for _name, _module in unet.named_modules():
461
+ if _module.__class__.__name__ == "CrossAttention":
462
+ if 'attn1' in _name: continue
463
+ _module.__class__.__call__ = inj_forward_crossattention
464
+
465
+ shape = _module.to_k.weight.shape
466
+ to_k_global = nn.Linear(shape[1], shape[0], bias=False)
467
+ to_k_global.weight.data = _module.to_k.weight.data.clone()
468
+ mapper.add_module(f'{_name.replace(".", "_")}_to_k', to_k_global)
469
+
470
+ shape = _module.to_v.weight.shape
471
+ to_v_global = nn.Linear(shape[1], shape[0], bias=False)
472
+ to_v_global.weight.data = _module.to_v.weight.data.clone()
473
+ mapper.add_module(f'{_name.replace(".", "_")}_to_v', to_v_global)
474
+
475
+ to_k_local = nn.Linear(shape[1], shape[0], bias=False)
476
+ to_k_local.weight.data = _module.to_k.weight.data.clone()
477
+ mapper_local.add_module(f'{_name.replace(".", "_")}_to_k', to_k_local)
478
+ _module.add_module('to_k_local', to_k_local)
479
+
480
+ to_v_local = nn.Linear(shape[1], shape[0], bias=False)
481
+ to_v_local.weight.data = _module.to_v.weight.data.clone()
482
+ mapper_local.add_module(f'{_name.replace(".", "_")}_to_v', to_v_local)
483
+ _module.add_module('to_v_local', to_v_local)
484
+
485
+ if args.global_mapper_path is None:
486
+ _module.add_module('to_k_global', to_k_global)
487
+ _module.add_module('to_v_global', to_v_global)
488
+
489
+ if args.local_mapper_path is None:
490
+ _module.add_module('to_k_local', to_k_local)
491
+ _module.add_module('to_v_local', to_v_local)
492
+
493
+ if args.global_mapper_path is not None:
494
+ mapper.load_state_dict(torch.load(args.global_mapper_path, map_location='cpu'))
495
+ for _name, _module in unet.named_modules():
496
+ if _module.__class__.__name__ == "CrossAttention":
497
+ if 'attn1' in _name: continue
498
+ _module.add_module('to_k_global', getattr(mapper, f'{_name.replace(".", "_")}_to_k'))
499
+ _module.add_module('to_v_global', getattr(mapper, f'{_name.replace(".", "_")}_to_v'))
500
+
501
+ if args.local_mapper_path is not None:
502
+ mapper_local.load_state_dict(torch.load(args.local_mapper_path, map_location='cpu'))
503
+ for _name, _module in unet.named_modules():
504
+ if _module.__class__.__name__ == "CrossAttention":
505
+ if 'attn1' in _name: continue
506
+ _module.add_module('to_k_local', getattr(mapper_local, f'{_name.replace(".", "_")}_to_k'))
507
+ _module.add_module('to_v_local', getattr(mapper_local, f'{_name.replace(".", "_")}_to_v'))
508
+
509
+ # Freeze vae, unet, and the text and image encoders
510
+ freeze_params(vae.parameters())
511
+ freeze_params(unet.parameters())
512
+ freeze_params(text_encoder.parameters())
513
+ freeze_params(image_encoder.parameters())
514
+ unfreeze_params(mapper_local.parameters())
515
+
516
+ if args.scale_lr:
517
+ args.learning_rate = (
518
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
519
+ )
520
+
521
+ # Initialize the optimizer
522
+ optimizer = torch.optim.AdamW(
523
+ itertools.chain(mapper_local.parameters()), # only optimize the local mapper parameters
524
+ lr=args.learning_rate,
525
+ betas=(args.adam_beta1, args.adam_beta2),
526
+ weight_decay=args.adam_weight_decay,
527
+ eps=args.adam_epsilon,
528
+ )
529
+
530
+ noise_scheduler = DDPMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler")
531
+
532
+ train_dataset = OpenImagesDatasetWithMask(
533
+ data_root=args.train_data_dir,
534
+ tokenizer=tokenizer,
535
+ size=args.resolution,
536
+ placeholder_token=args.placeholder_token,
537
+ set="test"
538
+ )
539
+ train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)
540
+
541
+ # Scheduler and math around the number of training steps.
542
+ overrode_max_train_steps = False
543
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
544
+ if args.max_train_steps is None:
545
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
546
+ overrode_max_train_steps = True
547
+
548
+ lr_scheduler = get_scheduler(
549
+ args.lr_scheduler,
550
+ optimizer=optimizer,
551
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
552
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
553
+ )
554
+
555
+ mapper_local, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
556
+ mapper_local, optimizer, train_dataloader, lr_scheduler
557
+ )
558
+
559
+ # Move vae, unet, the encoders and the global mapper to device
560
+ vae.to(accelerator.device)
561
+ unet.to(accelerator.device)
562
+ image_encoder.to(accelerator.device)
563
+ text_encoder.to(accelerator.device)
564
+ mapper.to(accelerator.device)
565
+ # Keep vae, unet, image_encoder and the global mapper in eval mode as we don't train these
566
+ vae.eval()
567
+ unet.eval()
568
+ image_encoder.eval()
569
+ mapper.eval()
570
+
571
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
572
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
573
+ if overrode_max_train_steps:
574
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
575
+ # Afterwards we recalculate our number of training epochs
576
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
577
+
578
+ # We need to initialize the trackers we use, and also store our configuration.
579
+ # The trackers initialize automatically on the main process.
580
+ if accelerator.is_main_process:
581
+ accelerator.init_trackers("elite", config=vars(args))
582
+
583
+ # Train!
584
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
585
+
586
+ logger.info("***** Running training *****")
587
+ logger.info(f" Num examples = {len(train_dataset)}")
588
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
589
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
590
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
591
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
592
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
593
+ # Only show the progress bar once on each machine.
594
+ progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
595
+ progress_bar.set_description("Steps")
596
+ global_step = 0
597
+
598
+ for epoch in range(args.num_train_epochs):
599
+ mapper_local.train()
600
+ for step, batch in enumerate(train_dataloader):
601
+ with accelerator.accumulate(mapper_local):
602
+ # Convert images to latent space
603
+ latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach()
604
+ latents = latents * 0.18215
605
+
606
+ # Sample noise that we'll add to the latents
607
+ noise = torch.randn(latents.shape).to(latents.device)
608
+ bsz = latents.shape[0]
609
+ # Sample a random timestep for each image
610
+ timesteps = torch.randint(
611
+ 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
612
+ ).long()
613
+
614
+ # Add noise to the latents according to the noise magnitude at each timestep
615
+ # (this is the forward diffusion process)
616
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
617
+
618
+ placeholder_idx = batch["index"]
619
+ image = F.interpolate(batch["pixel_values_clip"], (224, 224), mode='bilinear')
620
+ image_obj = F.interpolate(batch["pixel_values_obj"], (224, 224), mode='bilinear')
621
+
622
+ mask = F.interpolate(batch["pixel_values_seg"], (16, 16), mode='nearest')
623
+ mask = mask[:, 0].reshape(mask.shape[0], -1, 1)
624
+
625
+ image_features = image_encoder(image, output_hidden_states=True)
626
+ image_embeddings = [image_features[0], image_features[2][4], image_features[2][8], image_features[2][12], image_features[2][16]]
627
+ image_embeddings = [emb.detach() for emb in image_embeddings]
628
+ inj_embedding = mapper(image_embeddings)
629
+
630
+ # Only use the embedding for the first word
631
+ inj_embedding = inj_embedding[:, 0:1, :]
632
+
633
+ # Get the text embedding for conditioning
634
+ encoder_hidden_states = text_encoder({'input_ids': batch["input_ids"],
635
+ "inj_embedding": inj_embedding,
636
+ "inj_index": placeholder_idx.detach()})[0]
637
+
638
+ image_features_obj = image_encoder(image_obj, output_hidden_states=True)
639
+ image_embeddings_obj = [image_features_obj[0], image_features_obj[2][4], image_features_obj[2][8], image_features_obj[2][12], image_features_obj[2][16]]
640
+ image_embeddings_obj = [emb.detach() for emb in image_embeddings_obj]
641
+
642
+ inj_embedding_local = mapper_local(image_embeddings_obj)
643
+ inj_embedding_local = inj_embedding_local * mask
644
+
645
+
646
+ noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states={
647
+ "CONTEXT_TENSOR": encoder_hidden_states,
648
+ "LOCAL": inj_embedding_local,
649
+ "LOCAL_INDEX": placeholder_idx.detach()
650
+ }).sample
651
+
652
+ mask_values = batch["mask_values"]
653
+ loss_mle = F.mse_loss(noise_pred, noise, reduction="none")
654
+ loss_mle = ((loss_mle*mask_values).sum([1, 2, 3])/mask_values.sum([1, 2, 3])).mean()
655
+
656
+ loss_reg = 0
657
+ for vvv in value_local_list:
658
+ loss_reg += torch.mean(torch.abs(vvv))
659
+ loss_reg = loss_reg / len(value_local_list) * 0.0001
660
+
661
+ loss = loss_mle + loss_reg
662
+
663
+ accelerator.backward(loss)
664
+
665
+ if accelerator.sync_gradients:
666
+ accelerator.clip_grad_norm_(mapper_local.parameters(), 1)
667
+
668
+ optimizer.step()
669
+ lr_scheduler.step()
670
+ optimizer.zero_grad()
671
+ value_local_list.clear()
672
+
673
+
674
+ # Checks if the accelerator has performed an optimization step behind the scenes
675
+ if accelerator.sync_gradients:
676
+ progress_bar.update(1)
677
+ global_step += 1
678
+ if global_step % args.save_steps == 0:
679
+ save_progress(mapper_local, accelerator, args, global_step)
680
+ syn_images = validation(batch, tokenizer, image_encoder, text_encoder, unet, mapper, mapper_local, vae, batch["pixel_values_clip"].device, 5)
681
+ input_images = [th2image(img) for img in batch["pixel_values"]]
682
+ clip_images = [th2image(img).resize((512, 512)) for img in batch["pixel_values_clip"]]
683
+ obj_images = [th2image(img).resize((512, 512)) for img in batch["pixel_values_obj"]]
684
+ input_masks = torch.cat([mask_values, mask_values, mask_values], dim=1)
685
+ input_masks = [th2image(img).resize((512, 512)) for img in input_masks]
686
+ obj_masks = [th2image(img).resize((512, 512)) for img in batch["pixel_values_seg"]]
687
+ img_list = []
688
+ for syn, input_img, input_mask, clip_image, obj_image, obj_mask in zip(syn_images, input_images, input_masks, clip_images, obj_images, obj_masks):
689
+ img_list.append(np.concatenate((np.array(syn), np.array(input_img), np.array(input_mask), np.array(clip_image), np.array(obj_image), np.array(obj_mask)), axis=1))
690
+ img_list = np.concatenate(img_list, axis=0)
691
+ Image.fromarray(img_list).save(os.path.join(args.output_dir, f"{str(global_step).zfill(5)}.jpg"))
692
+
693
+ logs = {"loss_mle": loss_mle.detach().item(), "loss_reg": loss_reg.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
694
+ progress_bar.set_postfix(**logs)
695
+ accelerator.log(logs, step=global_step)
696
+
697
+ if global_step >= args.max_train_steps:
698
+ break
699
+
700
+ accelerator.wait_for_everyone()
701
+
702
+ if accelerator.is_main_process:
703
+ save_progress(mapper_local, accelerator, args)
704
+
705
+ accelerator.end_training()
706
+
707
+
708
+ if __name__ == "__main__":
709
+ main()
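
Note on the loss computed above: the denoising objective is a standard noise-prediction MSE, but it is averaged only over the foreground region given by mask_values, and a small L1 penalty on the attention values collected in value_local_list is added on top. Below is a minimal standalone sketch of the masked MSE term only; the tensor shapes are illustrative assumptions, not taken from the dataset code.

import torch
import torch.nn.functional as F

# Illustrative shapes: (batch, channels, height, width) latents and a
# single-channel spatial mask that broadcasts over the channel dimension.
noise_pred = torch.randn(2, 4, 64, 64)
noise = torch.randn(2, 4, 64, 64)
mask_values = (torch.rand(2, 1, 64, 64) > 0.5).float()

# Per-pixel squared error, masked, then normalised by the mask area of each
# sample before taking the batch mean, mirroring the loss_mle lines above.
loss_mle = F.mse_loss(noise_pred, noise, reduction="none")
loss_mle = ((loss_mle * mask_values).sum([1, 2, 3]) / mask_values.sum([1, 2, 3])).mean()
print(loss_mle.item())
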
train_local.sh ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export MODEL_NAME="CompVis/stable-diffusion-v1-4"
2
+ export DATA_DIR='/home/weiyuxiang/datasets/Open_Images/'
3
+ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --config_file 4_gpu.json --main_process_port 25657 train_local.py \
4
+ --pretrained_model_name_or_path=$MODEL_NAME \
5
+ --train_data_dir=$DATA_DIR \
6
+ --placeholder_token="S" \
7
+ --resolution=512 \
8
+ --train_batch_size=2 \
9
+ --gradient_accumulation_steps=4 \
10
+ --max_train_steps=200000 \
11
+ --learning_rate=1e-5 --scale_lr \
12
+ --lr_scheduler="constant" \
13
+ --lr_warmup_steps=0 \
14
+ --global_mapper_path "./elite_experiments/global_mapping/mapper_070000.pt" \
15
+ --output_dir="./elite_experiments/local_mapping" \
16
+ --save_steps 200
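
A quick sanity check on how these launch flags combine: the total train batch size logged by the script is train_batch_size x num_processes x gradient_accumulation_steps, i.e. 2 x 4 x 4 = 32 here, and with --scale_lr the base learning rate is typically multiplied by that same factor. The exact scaling is defined in the argument parsing earlier in train_local.py, so treat the sketch below as the usual diffusers convention rather than a guarantee.

# Values copied from train_local.sh / 4_gpu.json; the --scale_lr behaviour is
# an assumption based on the common convention in diffusers training scripts.
train_batch_size = 2               # per device
num_processes = 4                  # 4_gpu.json, CUDA_VISIBLE_DEVICES=0,1,2,3
gradient_accumulation_steps = 4
base_lr = 1e-5

effective_batch_size = train_batch_size * num_processes * gradient_accumulation_steps
scaled_lr = base_lr * effective_batch_size  # only applies if --scale_lr is set

print(effective_batch_size)  # 32
print(scaled_lr)             # 0.00032
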