Quỳnh Phùng
committed on
Commit • ce7c64a
1 Parent(s): 589b7f1
update
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- __pycache__/app.cpython-38.pyc +0 -0
- __pycache__/example_component.cpython-38.pyc +0 -0
- dataset/__init__.py +0 -0
- dataset/__pycache__/__init__.cpython-38.pyc +0 -0
- dataset/__pycache__/catalog.cpython-38.pyc +0 -0
- dataset/__pycache__/concat_dataset.cpython-38.pyc +0 -0
- dataset/base_dataset.py +220 -0
- dataset/catalog.py +72 -0
- dataset/cd_dataset.py +250 -0
- dataset/concat_dataset.py +65 -0
- dataset/grounding_dataset.py +205 -0
- dataset/layout_dataset.py +237 -0
- dataset/tsv.py +212 -0
- dataset/tsv_dataset.py +326 -0
- dataset/utils.py +116 -0
- gligen/__pycache__/__init__.cpython-38.pyc +0 -0
- gligen/__pycache__/distributed.cpython-38.pyc +0 -0
- gligen/__pycache__/evaluator.cpython-38.pyc +0 -0
- gligen/__pycache__/task_grounded_generation.cpython-38.pyc +0 -0
- gligen/__pycache__/trainer.cpython-38.pyc +0 -0
- gligen/ldm/__pycache__/util.cpython-38.pyc +0 -0
- gligen/ldm/models/.DS_Store +0 -0
- gligen/ldm/models/__pycache__/autoencoder.cpython-38.pyc +0 -0
- gligen/ldm/models/autoencoder.py +52 -0
- gligen/ldm/models/diffusion/__init__.py +0 -0
- gligen/ldm/models/diffusion/__pycache__/__init__.cpython-38.pyc +0 -0
- gligen/ldm/models/diffusion/__pycache__/ddim.cpython-38.pyc +0 -0
- gligen/ldm/models/diffusion/__pycache__/ddpm.cpython-38.pyc +0 -0
- gligen/ldm/models/diffusion/__pycache__/gaussian_smoothing.cpython-38.pyc +0 -0
- gligen/ldm/models/diffusion/__pycache__/ldm.cpython-38.pyc +0 -0
- gligen/ldm/models/diffusion/__pycache__/loss.cpython-38.pyc +0 -0
- gligen/ldm/models/diffusion/__pycache__/plms.cpython-38.pyc +0 -0
- gligen/ldm/models/diffusion/classifier.py +267 -0
- gligen/ldm/models/diffusion/ddim.py +134 -0
- gligen/ldm/models/diffusion/ddpm.py +72 -0
- gligen/ldm/models/diffusion/gaussian_smoothing.py +119 -0
- gligen/ldm/models/diffusion/ldm.py +88 -0
- gligen/ldm/models/diffusion/loss.py +170 -0
- gligen/ldm/models/diffusion/plms.py +295 -0
- gligen/ldm/modules/__pycache__/attention.cpython-38.pyc +0 -0
- gligen/ldm/modules/__pycache__/x_transformer.cpython-38.pyc +0 -0
- gligen/ldm/modules/diffusionmodules/__pycache__/__init__.cpython-37.pyc +0 -0
- gligen/ldm/modules/diffusionmodules/__pycache__/__init__.cpython-38.pyc +0 -0
- gligen/ldm/modules/diffusionmodules/__pycache__/convnext.cpython-38.pyc +0 -0
- gligen/ldm/modules/diffusionmodules/__pycache__/model.cpython-38.pyc +0 -0
- gligen/ldm/modules/diffusionmodules/__pycache__/normal_grounding_net.cpython-38.pyc +0 -0
- gligen/ldm/modules/diffusionmodules/__pycache__/openaimodel.cpython-38.pyc +0 -0
- gligen/ldm/modules/diffusionmodules/__pycache__/text_grounding_net.cpython-38.pyc +0 -0
- gligen/ldm/modules/diffusionmodules/__pycache__/util.cpython-37.pyc +0 -0
- gligen/ldm/modules/diffusionmodules/__pycache__/util.cpython-38.pyc +0 -0
__pycache__/app.cpython-38.pyc
ADDED
Binary file (25.8 kB).
__pycache__/example_component.cpython-38.pyc
ADDED
Binary file (26.6 kB).
dataset/__init__.py
ADDED
File without changes
dataset/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (139 Bytes).
dataset/__pycache__/catalog.cpython-38.pyc
ADDED
Binary file (1.11 kB).
dataset/__pycache__/concat_dataset.cpython-38.pyc
ADDED
Binary file (1.88 kB).
dataset/base_dataset.py
ADDED
@@ -0,0 +1,220 @@
import torch
from PIL import Image, ImageDraw
import torchvision.transforms as transforms
import torchvision
from zipfile import ZipFile
import os
import multiprocessing
import math
import numpy as np
import random
from io import BytesIO

VALID_IMAGE_TYPES = ['.jpg', '.jpeg', '.tiff', '.bmp', '.png']


def check_filenames_in_zipdata(filenames, ziproot):
    samples = []
    for fst in ZipFile(ziproot).infolist():
        fname = fst.filename
        if fname.endswith('/') or fname.startswith('.') or fst.file_size == 0:
            continue
        if os.path.splitext(fname)[1].lower() in VALID_IMAGE_TYPES:
            samples.append((fname))
    filenames = set(filenames)
    samples = set(samples)
    assert filenames.issubset(samples), 'Something wrong with your zip data'


def draw_box(img, boxes):
    colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"]
    draw = ImageDraw.Draw(img)
    for bid, box in enumerate(boxes):
        draw.rectangle([box[0], box[1], box[2], box[3]], outline=colors[bid % len(colors)], width=4)
        # draw.rectangle([box[0], box[1], box[2], box[3]], outline ="red", width=2) # x0 y0 x1 y1
    return img


def to_valid(x0, y0, x1, y1, image_size, min_box_size):
    valid = True

    if x0 > image_size or y0 > image_size or x1 < 0 or y1 < 0:
        valid = False  # no way to make this box vide, it is completely cropped out
        return valid, (None, None, None, None)

    x0 = max(x0, 0)
    y0 = max(y0, 0)
    x1 = min(x1, image_size)
    y1 = min(y1, image_size)

    if (x1 - x0) * (y1 - y0) / (image_size * image_size) < min_box_size:
        valid = False
        return valid, (None, None, None, None)

    return valid, (x0, y0, x1, y1)


def recalculate_box_and_verify_if_valid(x, y, w, h, trans_info, image_size, min_box_size):
    """
    x,y,w,h: the original annotation corresponding to the raw image size.
    trans_info: what resizing and cropping have been applied to the raw image
    image_size: what is the final image size
    """

    x0 = x * trans_info["performed_scale"] - trans_info['crop_x']
    y0 = y * trans_info["performed_scale"] - trans_info['crop_y']
    x1 = (x + w) * trans_info["performed_scale"] - trans_info['crop_x']
    y1 = (y + h) * trans_info["performed_scale"] - trans_info['crop_y']

    # at this point, box annotation has been recalculated based on scaling and cropping
    # but some point may fall off the image_size region (e.g., negative value), thus we
    # need to clamp them into 0-image_size. But if all points falling outsize of image
    # region, then we will consider this is an invalid box.
    valid, (x0, y0, x1, y1) = to_valid(x0, y0, x1, y1, image_size, min_box_size)

    if valid:
        # we also perform random flip.
        # Here boxes are valid, and are based on image_size
        if trans_info["performed_flip"]:
            x0, x1 = image_size - x1, image_size - x0

    return valid, (x0, y0, x1, y1)


class BaseDataset(torch.utils.data.Dataset):
    def __init__(self, image_root, random_crop, random_flip, image_size):
        super().__init__()
        self.image_root = image_root
        self.random_crop = random_crop
        self.random_flip = random_flip
        self.image_size = image_size
        self.use_zip = False

        if image_root[-4::] == 'zip':
            self.use_zip = True
            self.zip_dict = {}

        if self.random_crop:
            assert False, 'NOT IMPLEMENTED'

    def fetch_zipfile(self, ziproot):
        pid = multiprocessing.current_process().pid  # get pid of this process.
        if pid not in self.zip_dict:
            self.zip_dict[pid] = ZipFile(ziproot)
        zip_file = self.zip_dict[pid]
        return zip_file

    def fetch_image(self, filename):
        if self.use_zip:
            zip_file = self.fetch_zipfile(self.image_root)
            image = Image.open(BytesIO(zip_file.read(filename))).convert('RGB')
            return image
        else:
            image = Image.open(os.path.join(self.image_root, filename)).convert('RGB')
            return image

    def vis_getitem_data(self, index=None, out=None, return_tensor=False, name="res.jpg", print_caption=True):

        if out is None:
            out = self[index]

        img = torchvision.transforms.functional.to_pil_image(out["image"] * 0.5 + 0.5)
        canvas = torchvision.transforms.functional.to_pil_image(torch.ones_like(out["image"]))
        W, H = img.size

        if print_caption:
            caption = out["caption"]
            print(caption)
            print(" ")

        boxes = []
        for box in out["boxes"]:
            x0, y0, x1, y1 = box
            boxes.append([float(x0 * W), float(y0 * H), float(x1 * W), float(y1 * H)])
        img = draw_box(img, boxes)

        if return_tensor:
            return torchvision.transforms.functional.to_tensor(img)
        else:
            img.save(name)

    def transform_image(self, pil_image):
        if self.random_crop:
            assert False
            arr = random_crop_arr(pil_image, self.image_size)
        else:
            arr, info = center_crop_arr(pil_image, self.image_size)

        info["performed_flip"] = False
        if self.random_flip and random.random() < 0.5:
            arr = arr[:, ::-1]
            info["performed_flip"] = True

        arr = arr.astype(np.float32) / 127.5 - 1
        arr = np.transpose(arr, [2, 0, 1])

        return torch.tensor(arr), info


def center_crop_arr(pil_image, image_size):
    # We are not on a new enough PIL to support the `reducing_gap`
    # argument, which uses BOX downsampling at powers of two first.
    # Thus, we do it by hand to improve downsample quality.
    WW, HH = pil_image.size

    while min(*pil_image.size) >= 2 * image_size:
        pil_image = pil_image.resize(
            tuple(x // 2 for x in pil_image.size), resample=Image.BOX
        )

    scale = image_size / min(*pil_image.size)

    pil_image = pil_image.resize(
        tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
    )

    # at this point, the min of pil_image side is desired image_size
    performed_scale = image_size / min(WW, HH)

    arr = np.array(pil_image)
    crop_y = (arr.shape[0] - image_size) // 2
    crop_x = (arr.shape[1] - image_size) // 2

    info = {"performed_scale": performed_scale, 'crop_y': crop_y, 'crop_x': crop_x, "WW": WW, 'HH': HH}

    return arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size], info


def random_crop_arr(pil_image, image_size, min_crop_frac=0.8, max_crop_frac=1.0):
    min_smaller_dim_size = math.ceil(image_size / max_crop_frac)
    max_smaller_dim_size = math.ceil(image_size / min_crop_frac)
    smaller_dim_size = random.randrange(min_smaller_dim_size, max_smaller_dim_size + 1)

    # We are not on a new enough PIL to support the `reducing_gap`
    # argument, which uses BOX downsampling at powers of two first.
    # Thus, we do it by hand to improve downsample quality.
    while min(*pil_image.size) >= 2 * smaller_dim_size:
        pil_image = pil_image.resize(
            tuple(x // 2 for x in pil_image.size), resample=Image.BOX
        )

    scale = smaller_dim_size / min(*pil_image.size)
    pil_image = pil_image.resize(
        tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
    )

    arr = np.array(pil_image)
    crop_y = random.randrange(arr.shape[0] - image_size + 1)
    crop_x = random.randrange(arr.shape[1] - image_size + 1)
    return arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size]
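A quick way to sanity-check the crop/box bookkeeping above without any dataset on disk is the sketch below. It is illustrative only and not part of the committed file; it assumes this repository layout (so `dataset.base_dataset` imports from the repo root), builds a synthetic PIL image, runs it through `center_crop_arr`, and re-projects one raw-pixel box with `recalculate_box_and_verify_if_valid`.

# Illustrative only; assumes this repo layout and working Pillow/NumPy/torch installs.
from PIL import Image
from dataset.base_dataset import center_crop_arr, recalculate_box_and_verify_if_valid

# A fake 640x480 RGB image standing in for a raw training image.
raw = Image.new("RGB", (640, 480), color=(128, 128, 128))

# Resize and center-crop to 256x256; `info` records the scale and crop offsets.
arr, info = center_crop_arr(raw, 256)
info["performed_flip"] = False          # transform_image() would set this field
print(arr.shape, info)                  # (256, 256, 3) plus the transform record

# A COCO-style xywh box in raw-image pixels, re-expressed in the 256x256 frame.
valid, (x0, y0, x1, y1) = recalculate_box_and_verify_if_valid(
    x=100, y=50, w=200, h=150, trans_info=info, image_size=256, min_box_size=0.01
)
print(valid, (x0, y0, x1, y1))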
dataset/catalog.py
ADDED
@@ -0,0 +1,72 @@
import os

class DatasetCatalog:
    def __init__(self, ROOT, which_embedder):
        assert which_embedder in ['clip', 'bert']

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

        self.VGGrounding = {
            "target": "dataset.tsv_dataset.TSVDataset",
            "train_params": dict(
                tsv_path=os.path.join(ROOT, 'GROUNDING/gqa/tsv/train-00.tsv'),
            )
        }

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

        self.FlickrGrounding = {
            "target": "dataset.tsv_dataset.TSVDataset",
            "train_params": dict(
                tsv_path=os.path.join(ROOT, 'GROUNDING/flickr30k/tsv/train-00.tsv'),
            )
        }

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

        self.SBUGrounding = {
            "target": "dataset.tsv_dataset.TSVDataset",
            "train_params": dict(
                tsv_path=os.path.join(ROOT, 'GROUNDING/SBU/tsv/train-00.tsv'),
            )
        }

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

        self.CC3MGrounding = {
            "target": "dataset.tsv_dataset.TSVDataset",
            "train_params": dict(
                tsv_path=os.path.join(ROOT, 'GROUNDING/CC3M/tsv/train-00.tsv'),
            )
        }

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

        self.CC12MGrounding = {
            "target": "dataset.tsv_dataset.TSVDataset",
            "train_params": dict(
                tsv_path=os.path.join(ROOT, 'GROUNDING/CC12M/tsv/train-00.tsv'),
            )
        }

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

        # temp = 'category_embedding_clip.pth' if which_embedder == 'clip' else 'category_embedding_bert.pth'
        # obj365_category_embedding_path = os.path.join(ROOT, 'OBJECTS365', temp)

        self.Obj365Detection = {
            "target": "dataset.tsv_dataset.TSVDataset",
            "train_params": dict(
                tsv_path=os.path.join(ROOT, 'OBJECTS365/tsv/train-00.tsv'),
            ),
        }
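For context on how these catalog entries are consumed (see concat_dataset.py later in this diff), a small illustrative sketch, not part of the commit: each attribute of `DatasetCatalog` is simply a dict holding a `target` import path and its `train_params`. The `ROOT` value below is a placeholder string and nothing is read from disk.

# Illustrative only; "DATA" is a placeholder root, no files are opened here.
from dataset.catalog import DatasetCatalog

catalog = DatasetCatalog(ROOT="DATA", which_embedder="clip")

entry = getattr(catalog, "FlickrGrounding")     # the same lookup ConCatDataset performs
print(entry["target"])                          # dataset.tsv_dataset.TSVDataset
print(entry["train_params"]["tsv_path"])        # DATA/GROUNDING/flickr30k/tsv/train-00.tsv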
dataset/cd_dataset.py
ADDED
@@ -0,0 +1,250 @@
import json, os, random, math
from collections import defaultdict
from copy import deepcopy

import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms

import numpy as np
from PIL import Image
from .base_dataset import BaseDataset, check_filenames_in_zipdata, recalculate_box_and_verify_if_valid
from io import BytesIO


def not_in_at_all(list1, list2):
    for a in list1:
        if a in list2:
            return False
    return True


def clean_annotations(annotations):
    for anno in annotations:
        anno.pop("segmentation", None)
        anno.pop("area", None)
        anno.pop("iscrowd", None)
        # anno.pop("id", None)


def make_a_sentence(obj_names, clean=False):

    if clean:
        obj_names = [name[:-6] if ("-other" in name) else name for name in obj_names]

    caption = ""
    tokens_positive = []
    for obj_name in obj_names:
        start_len = len(caption)
        caption += obj_name
        end_len = len(caption)
        caption += ", "
        tokens_positive.append(
            [[start_len, end_len]]  # in real caption, positive tokens can be disjoint, thus using list of list
        )
    caption = caption[:-2]  # remove last ", "

    return caption  # , tokens_positive


def check_all_have_same_images(instances_data, stuff_data, caption_data):
    if stuff_data is not None:
        assert instances_data["images"] == stuff_data["images"]
    if caption_data is not None:
        assert instances_data["images"] == caption_data["images"]


class CDDataset(BaseDataset):
    "CD: Caption Detection"
    def __init__(self,
                 image_root,
                 category_embedding_path,
                 instances_json_path=None,
                 stuff_json_path=None,
                 caption_json_path=None,
                 prob_real_caption=0,
                 fake_caption_type='empty',
                 image_size=256,
                 max_images=None,
                 min_box_size=0.01,
                 max_boxes_per_image=8,
                 include_other=False,
                 random_crop=False,
                 random_flip=True,
                 ):
        super().__init__(random_crop, random_flip, image_size)

        self.image_root = image_root
        self.category_embedding_path = category_embedding_path
        self.instances_json_path = instances_json_path
        self.stuff_json_path = stuff_json_path
        self.caption_json_path = caption_json_path
        self.prob_real_caption = prob_real_caption
        self.fake_caption_type = fake_caption_type
        self.max_images = max_images
        self.min_box_size = min_box_size
        self.max_boxes_per_image = max_boxes_per_image
        self.include_other = include_other

        assert fake_caption_type in ["empty", "made"]
        if prob_real_caption > 0:
            assert caption_json_path is not None, "caption json must be given"

        # Load all jsons
        with open(instances_json_path, 'r') as f:
            instances_data = json.load(f)  # keys: 'info', 'images', 'licenses', 'categories', 'annotations'
        clean_annotations(instances_data["annotations"])
        self.instances_data = instances_data

        self.stuff_data = None
        if stuff_json_path is not None:
            with open(stuff_json_path, 'r') as f:
                stuff_data = json.load(f)  # keys: 'info', 'images', 'licenses', 'categories', 'annotations'
            clean_annotations(stuff_data["annotations"])
            self.stuff_data = stuff_data

        self.captions_data = None
        if caption_json_path is not None:
            with open(caption_json_path, 'r') as f:
                captions_data = json.load(f)  # keys: 'info', 'images', 'licenses', 'categories', 'annotations'
            clean_annotations(captions_data["annotations"])
            self.captions_data = captions_data

        # Load preprocessed name embedding
        self.category_embeddings = torch.load(category_embedding_path)
        self.embedding_len = list(self.category_embeddings.values())[0].shape[0]

        # Misc
        self.image_ids = []  # main list for selecting images
        self.image_id_to_filename = {}  # file names used to read image
        check_all_have_same_images(self.instances_data, self.stuff_data, self.captions_data)
        for image_data in self.instances_data['images']:
            image_id = image_data['id']
            filename = image_data['file_name']
            self.image_ids.append(image_id)
            self.image_id_to_filename[image_id] = filename

        # All category names (including things and stuff)
        self.object_idx_to_name = {}
        for category_data in self.instances_data['categories']:
            self.object_idx_to_name[category_data['id']] = category_data['name']
        if self.stuff_data is not None:
            for category_data in self.stuff_data['categories']:
                self.object_idx_to_name[category_data['id']] = category_data['name']

        # Add object data from instances and stuff
        self.image_id_to_objects = defaultdict(list)
        self.select_objects(self.instances_data['annotations'])
        if self.stuff_data is not None:
            self.select_objects(self.stuff_data['annotations'])

        # Add caption data
        if self.captions_data is not None:
            self.image_id_to_captions = defaultdict(list)
            self.select_captions(self.captions_data['annotations'])

        # Check if all filenames can be found in the zip file
        # all_filenames = [self.image_id_to_filename[idx] for idx in self.image_ids]
        # check_filenames_in_zipdata(all_filenames, image_root)


    def select_objects(self, annotations):
        for object_anno in annotations:
            image_id = object_anno['image_id']
            object_name = self.object_idx_to_name[object_anno['category_id']]
            other_ok = object_name != 'other' or self.include_other
            if other_ok:
                self.image_id_to_objects[image_id].append(object_anno)


    def select_captions(self, annotations):
        for caption_data in annotations:
            image_id = caption_data['image_id']
            self.image_id_to_captions[image_id].append(caption_data)


    def total_images(self):
        return len(self)


    def __getitem__(self, index):
        if self.max_boxes_per_image > 99:
            assert False, "Are you sure setting such large number of boxes?"

        out = {}

        image_id = self.image_ids[index]
        out['id'] = image_id

        # Image
        filename = self.image_id_to_filename[image_id]
        image = self.fetch_image(filename)
        # WW, HH = image.size
        image_tensor, trans_info = self.transform_image(image)
        out["image"] = image_tensor

        # Select valid boxes after cropping (center or random)
        this_image_obj_annos = deepcopy(self.image_id_to_objects[image_id])
        areas = []
        all_obj_names = []
        all_boxes = []
        all_masks = []
        all_positive_embeddings = []
        for object_anno in this_image_obj_annos:

            x, y, w, h = object_anno['bbox']
            valid, (x0, y0, x1, y1) = recalculate_box_and_verify_if_valid(x, y, w, h, trans_info, self.image_size, self.min_box_size)

            if valid:
                areas.append((x1 - x0) * (y1 - y0))
                obj_name = self.object_idx_to_name[object_anno['category_id']]
                all_obj_names.append(obj_name)
                all_boxes.append(torch.tensor([x0, y0, x1, y1]) / self.image_size)  # scale to 0-1
                all_masks.append(1)
                all_positive_embeddings.append(self.category_embeddings[obj_name])

        wanted_idxs = torch.tensor(areas).sort(descending=True)[1]
        wanted_idxs = wanted_idxs[0:self.max_boxes_per_image]
        obj_names = []  # used for making a sentence
        boxes = torch.zeros(self.max_boxes_per_image, 4)
        masks = torch.zeros(self.max_boxes_per_image)
        positive_embeddings = torch.zeros(self.max_boxes_per_image, self.embedding_len)
        for i, idx in enumerate(wanted_idxs):
            obj_names.append(all_obj_names[idx])
            boxes[i] = all_boxes[idx]
            masks[i] = all_masks[idx]
            positive_embeddings[i] = all_positive_embeddings[idx]

        # Caption
        if random.uniform(0, 1) < self.prob_real_caption:
            caption_data = self.image_id_to_captions[image_id]
            idx = random.randint(0, len(caption_data) - 1)
            caption = caption_data[idx]["caption"]
        else:
            if self.fake_caption_type == "empty":
                caption = ""
            else:
                caption = make_a_sentence(obj_names, clean=True)

        out["caption"] = caption
        out["boxes"] = boxes
        out["masks"] = masks
        out["positive_embeddings"] = positive_embeddings

        return out


    def __len__(self):
        if self.max_images is None:
            return len(self.image_ids)
        return min(len(self.image_ids), self.max_images)
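Of the helpers above, `make_a_sentence` is the one that can be exercised in isolation without COCO annotations on disk; it builds the "made" fake caption from category names. A quick illustration (not part of the committed file):

# Illustrative only; assumes dataset.cd_dataset is importable from the repo root.
from dataset.cd_dataset import make_a_sentence

# COCO-stuff style names; clean=True strips the "-other" suffix from stuff classes.
names = ["person", "dog", "plant-other"]
print(make_a_sentence(names, clean=True))   # person, dog, plant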
dataset/concat_dataset.py
ADDED
@@ -0,0 +1,65 @@
from .catalog import DatasetCatalog
from ldm.util import instantiate_from_config
import torch


class ConCatDataset():
    def __init__(self, dataset_name_list, ROOT, which_embedder, train=True, repeats=None):
        self.datasets = []
        cul_previous_dataset_length = 0
        offset_map = []
        which_dataset = []

        if repeats is None:
            repeats = [1] * len(dataset_name_list)
        else:
            assert len(repeats) == len(dataset_name_list)

        Catalog = DatasetCatalog(ROOT, which_embedder)
        for dataset_idx, (dataset_name, yaml_params) in enumerate(dataset_name_list.items()):
            repeat = repeats[dataset_idx]

            dataset_dict = getattr(Catalog, dataset_name)

            target = dataset_dict['target']
            params = dataset_dict['train_params'] if train else dataset_dict['val_params']
            if yaml_params is not None:
                params.update(yaml_params)
            dataset = instantiate_from_config(dict(target=target, params=params))

            self.datasets.append(dataset)
            for _ in range(repeat):
                offset_map.append(torch.ones(len(dataset)) * cul_previous_dataset_length)
                which_dataset.append(torch.ones(len(dataset)) * dataset_idx)
                cul_previous_dataset_length += len(dataset)
        offset_map = torch.cat(offset_map, dim=0).long()
        self.total_length = cul_previous_dataset_length

        self.mapping = torch.arange(self.total_length) - offset_map
        self.which_dataset = torch.cat(which_dataset, dim=0).long()


    def total_images(self):
        count = 0
        for dataset in self.datasets:
            print(dataset.total_images())
            count += dataset.total_images()
        return count


    def __getitem__(self, idx):
        dataset = self.datasets[self.which_dataset[idx]]
        return dataset[self.mapping[idx]]


    def __len__(self):
        return self.total_length
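A note on the indexing scheme: `ConCatDataset` flattens several datasets (optionally repeated) into one global index space by recording, for every global index, which dataset it belongs to and what offset to subtract to get the local index. The toy sketch below reproduces only that bookkeeping with made-up lengths 3 and 2 and a repeat of 2 on the second dataset, so the mapping can be inspected without instantiating any real dataset (which would need `ldm.util` and TSV data).

import torch

lengths, repeats = [3, 2], [1, 2]          # toy datasets: len 3 and len 2, second repeated twice

offset_map, which_dataset, cul = [], [], 0
for ds_idx, (n, repeat) in enumerate(zip(lengths, repeats)):
    for _ in range(repeat):
        offset_map.append(torch.ones(n) * cul)        # global index minus this = local index
        which_dataset.append(torch.ones(n) * ds_idx)  # which dataset the global index hits
        cul += n

offset_map = torch.cat(offset_map).long()
mapping = torch.arange(cul) - offset_map
which_dataset = torch.cat(which_dataset).long()

print(which_dataset.tolist())  # [0, 0, 0, 1, 1, 1, 1]
print(mapping.tolist())        # [0, 1, 2, 0, 1, 0, 1]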
dataset/grounding_dataset.py
ADDED
@@ -0,0 +1,205 @@
from tkinter.messagebox import NO
import torch
import json
from collections import defaultdict
from PIL import Image, ImageDraw
from copy import deepcopy
import os
import torchvision.transforms as transforms
import torchvision
from .base_dataset import BaseDataset, check_filenames_in_zipdata, recalculate_box_and_verify_if_valid
from io import BytesIO
import random

def check_unique(images, fields):
    for field in fields:
        temp_list = []
        for img_info in images:
            temp_list.append(img_info[field])
        assert len(set(temp_list)) == len(temp_list), field

def clean_data(data):
    for data_info in data:
        data_info.pop("original_img_id", None)
        data_info.pop("original_id", None)
        data_info.pop("sentence_id", None)  # sentence id for each image (multiple sentences for one image)
        data_info.pop("dataset_name", None)
        data_info.pop("data_source", None)
        data_info["data_id"] = data_info.pop("id")


def clean_annotations(annotations):
    for anno_info in annotations:
        anno_info.pop("iscrowd", None)  # I have checked that all 0 for flickr, vg, coco
        anno_info.pop("category_id", None)  # I have checked that all 1 for flickr vg. This is not always 1 for coco, but I do not think we need this annotation
        anno_info.pop("area", None)
        # anno_info.pop("id", None)
        anno_info["data_id"] = anno_info.pop("image_id")


def draw_box(img, boxes):
    draw = ImageDraw.Draw(img)
    for box in boxes:
        draw.rectangle([box[0], box[1], box[2], box[3]], outline="red", width=2)  # x0 y0 x1 y1
    return img


def xyhw2xyxy(box):
    x0, y0, w, h = box
    return [x0, y0, x0 + w, y0 + h]


class GroundingDataset(BaseDataset):
    def __init__(self,
                 image_root,
                 json_path,
                 annotation_embedding_path,
                 prob_real_caption=1,
                 image_size=256,
                 min_box_size=0.01,
                 max_boxes_per_data=8,
                 max_images=None,  # set as 30K used to eval
                 random_crop=False,
                 random_flip=True,
                 ):
        super().__init__(image_root, random_crop, random_flip, image_size)
        self.image_root = image_root
        self.json_path = json_path
        self.annotation_embedding_path = annotation_embedding_path
        self.prob_real_caption = prob_real_caption
        self.min_box_size = min_box_size
        self.max_boxes_per_data = max_boxes_per_data
        self.max_images = max_images

        # Load raw data
        with open(json_path, 'r') as f:
            json_raw = json.load(f)  # keys: 'info', 'images', 'licenses', 'categories', 'annotations'
        self.data = json_raw["images"]  # donot name it images, which is misleading
        self.annotations = json_raw["annotations"]

        # Load preprocessed name embedding
        if 'bert' in annotation_embedding_path:
            self.embedding_len = 1280
        elif 'clip' in annotation_embedding_path:
            self.embedding_len = 768
        else:
            assert False

        # clean data and annotation
        check_unique(self.data, ['id'])
        check_unique(self.annotations, ['id'])
        clean_data(self.data)
        clean_annotations(self.annotations)
        self.data_id_list = [datum['data_id'] for datum in self.data]
        self.data = {datum['data_id']: datum for datum in self.data}  # map self.data from a list into a dict

        # data point to its annotation mapping
        self.data_id_to_annos = defaultdict(list)
        for anno in self.annotations:
            self.data_id_to_annos[anno["data_id"]].append(anno)


        # These are not used that offen, but are useful in some cases
        self.file_names = []  # all training images
        self.file_name_to_data_ids = defaultdict(list)  # for each image, there are multiple data points (captions)
        for data_id in self.data_id_list:
            fine_name = self.data[data_id]["file_name"]
            self.file_names.append(fine_name)
            self.file_name_to_data_ids[fine_name].append(data_id)
        self.file_names = list(set(self.file_names))

        if self.max_images is not None:
            "This is only used as COCO2017P evulation, when we set max_images as 30k"
            assert False, 'I have commented out the following code to save cpu memory'
            # new_data_id_list = []
            # new_file_name_to_data_ids = defaultdict(list)
            # self.file_names = self.file_names[0:self.max_images]
            # for file_name in self.file_names:
            #     data_id = self.file_name_to_data_ids[file_name][0]
            #     new_data_id_list.append(data_id)
            #     new_file_name_to_data_ids[file_name].append(data_id)
            # self.data_id_list = new_data_id_list
            # self.file_name_to_data_ids = new_file_name_to_data_ids

        # Check if all filenames can be found in the zip file
        # all_filenames = [self.data[idx]['file_name'] for idx in self.data_id_list ]
        # check_filenames_in_zipdata(all_filenames, image_root)


    def total_images(self):
        return len(self.file_names)


    def __getitem__(self, index):
        if self.max_boxes_per_data > 99:
            assert False, "Are you sure setting such large number of boxes?"

        out = {}

        data_id = self.data_id_list[index]
        out['id'] = data_id

        # Image and caption
        file_name = self.data[data_id]['file_name']
        image = self.fetch_image(file_name)
        image_tensor, trans_info = self.transform_image(image)
        out["image"] = image_tensor

        if random.uniform(0, 1) < self.prob_real_caption:
            out["caption"] = self.data[data_id]["caption"]
        else:
            out["caption"] = ""


        annos = deepcopy(self.data_id_to_annos[data_id])
        areas = []
        all_boxes = []
        all_masks = []
        all_positive_embeddings = []

        for anno in annos:

            x, y, w, h = anno['bbox']
            valid, (x0, y0, x1, y1) = recalculate_box_and_verify_if_valid(x, y, w, h, trans_info, self.image_size, self.min_box_size)

            if valid:
                areas.append((x1 - x0) * (y1 - y0))
                all_boxes.append(torch.tensor([x0, y0, x1, y1]) / self.image_size)  # scale to 0-1
                all_masks.append(1)
                all_positive_embeddings.append(torch.load(os.path.join(self.annotation_embedding_path, str(anno["id"])), map_location='cpu'))

        wanted_idxs = torch.tensor(areas).sort(descending=True)[1]
        wanted_idxs = wanted_idxs[0:self.max_boxes_per_data]

        boxes = torch.zeros(self.max_boxes_per_data, 4)
        masks = torch.zeros(self.max_boxes_per_data)
        positive_embeddings = torch.zeros(self.max_boxes_per_data, self.embedding_len)
        for i, idx in enumerate(wanted_idxs):
            boxes[i] = all_boxes[idx]
            masks[i] = all_masks[idx]
            positive_embeddings[i] = all_positive_embeddings[idx]

        out["boxes"] = boxes
        out["masks"] = masks
        out["positive_embeddings"] = positive_embeddings

        return out


    def __len__(self):
        return len(self.data_id_list)
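The per-item packing above (sort the surviving boxes by area, keep at most `max_boxes_per_data`, zero-pad the rest so every sample has fixed-size tensors) can be looked at in isolation. The sketch below is illustrative only and uses made-up boxes and random embeddings rather than real grounding annotations.

import torch

max_boxes, embedding_len = 4, 768   # 768 matches the 'clip' branch above

# Pretend three boxes survived recalculate_box_and_verify_if_valid (already scaled to 0-1).
all_boxes = [torch.tensor([0.1, 0.1, 0.9, 0.9]),
             torch.tensor([0.2, 0.2, 0.4, 0.4]),
             torch.tensor([0.0, 0.0, 0.5, 1.0])]
areas = [float((b[2] - b[0]) * (b[3] - b[1])) for b in all_boxes]
all_embeddings = [torch.randn(embedding_len) for _ in all_boxes]

wanted = torch.tensor(areas).sort(descending=True)[1][:max_boxes]   # largest-area boxes first

boxes = torch.zeros(max_boxes, 4)
masks = torch.zeros(max_boxes)
positive_embeddings = torch.zeros(max_boxes, embedding_len)
for i, idx in enumerate(wanted):
    boxes[i] = all_boxes[idx]
    masks[i] = 1                      # 1 marks a real box; trailing zeros are padding
    positive_embeddings[i] = all_embeddings[idx]

print(masks.tolist())                 # [1.0, 1.0, 1.0, 0.0]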
dataset/layout_dataset.py
ADDED
@@ -0,0 +1,237 @@
import json, os, random, math
from collections import defaultdict
from copy import deepcopy

import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms

import numpy as np
from PIL import Image, ImageOps
from .base_dataset import BaseDataset, check_filenames_in_zipdata
from io import BytesIO


def clean_annotations(annotations):
    for anno in annotations:
        anno.pop("segmentation", None)
        anno.pop("area", None)
        anno.pop("iscrowd", None)
        anno.pop("id", None)


def make_a_sentence(obj_names, clean=False):

    if clean:
        obj_names = [name[:-6] if ("-other" in name) else name for name in obj_names]

    caption = ""
    tokens_positive = []
    for obj_name in obj_names:
        start_len = len(caption)
        caption += obj_name
        end_len = len(caption)
        caption += ", "
        tokens_positive.append(
            [[start_len, end_len]]  # in real caption, positive tokens can be disjoint, thus using list of list
        )
    caption = caption[:-2]  # remove last ", "

    return caption  # , tokens_positive


class LayoutDataset(BaseDataset):
    """
    Note: this dataset can somehow be achieved in cd_dataset.CDDataset
    Since if you donot set prob_real_caption=0 in CDDataset, then that
    dataset will only use detection annotations. However, in that dataset,
    we do not remove images but remove boxes.

    However, in layout2img works, people will just resize raw image data into 256*256,
    thus they pre-calculate box size and apply min_box_size before min/max_boxes_per_image.
    And then they will remove images if does not follow the rule.

    These two different methods will lead to different number of training/val images.
    Thus this dataset here is only for layout2img.

    """
    def __init__(self,
                 image_root,
                 instances_json_path,
                 stuff_json_path,
                 category_embedding_path,
                 fake_caption_type='empty',
                 image_size=256,
                 max_samples=None,
                 min_box_size=0.02,
                 min_boxes_per_image=3,
                 max_boxes_per_image=8,
                 include_other=False,
                 random_flip=True
                 ):
        super().__init__(random_crop=None, random_flip=None, image_size=None)  # we only use vis_getitem func in BaseDataset, donot use the others.

        assert fake_caption_type in ['empty', 'made']
        self.image_root = image_root
        self.instances_json_path = instances_json_path
        self.stuff_json_path = stuff_json_path
        self.category_embedding_path = category_embedding_path
        self.fake_caption_type = fake_caption_type
        self.image_size = image_size
        self.max_samples = max_samples
        self.min_box_size = min_box_size
        self.min_boxes_per_image = min_boxes_per_image
        self.max_boxes_per_image = max_boxes_per_image
        self.include_other = include_other
        self.random_flip = random_flip


        self.transform = transforms.Compose([transforms.Resize((image_size, image_size)),
                                             transforms.ToTensor(),
                                             transforms.Lambda(lambda t: (t * 2) - 1)])

        # Load all jsons
        with open(instances_json_path, 'r') as f:
            instances_data = json.load(f)  # keys: 'info', 'images', 'licenses', 'categories', 'annotations'
        clean_annotations(instances_data["annotations"])
        self.instances_data = instances_data

        with open(stuff_json_path, 'r') as f:
            stuff_data = json.load(f)  # keys: 'info', 'images', 'licenses', 'categories', 'annotations'
        clean_annotations(stuff_data["annotations"])
        self.stuff_data = stuff_data

        # Load preprocessed name embedding
        self.category_embeddings = torch.load(category_embedding_path)
        self.embedding_len = list(self.category_embeddings.values())[0].shape[0]

        # Misc
        self.image_ids = []  # main list for selecting images
        self.image_id_to_filename = {}  # file names used to read image
        self.image_id_to_size = {}  # original size of this image
        assert instances_data['images'] == stuff_data["images"]
        for image_data in instances_data['images']:
            image_id = image_data['id']
            filename = image_data['file_name']
            width = image_data['width']
            height = image_data['height']
            self.image_ids.append(image_id)
            self.image_id_to_filename[image_id] = filename
            self.image_id_to_size[image_id] = (width, height)

        # All category names (including things and stuff)
        self.things_id_list = []
        self.stuff_id_list = []
        self.object_idx_to_name = {}
        for category_data in instances_data['categories']:
            self.things_id_list.append(category_data['id'])
            self.object_idx_to_name[category_data['id']] = category_data['name']
        for category_data in stuff_data['categories']:
            self.stuff_id_list.append(category_data['id'])
            self.object_idx_to_name[category_data['id']] = category_data['name']
        self.all_categories = [self.object_idx_to_name.get(k, None) for k in range(183 + 1)]

        # Add object data from instances and stuff
        self.image_id_to_objects = defaultdict(list)
        self.select_objects(instances_data['annotations'])
        self.select_objects(stuff_data['annotations'])

        # Prune images that have too few or too many objects
        new_image_ids = []
        for image_id in self.image_ids:
            num_objs = len(self.image_id_to_objects[image_id])
            if self.min_boxes_per_image <= num_objs <= self.max_boxes_per_image:
                new_image_ids.append(image_id)
        self.image_ids = new_image_ids

        # Check if all filenames can be found in the zip file
        all_filenames = [self.image_id_to_filename[idx] for idx in self.image_ids]
        check_filenames_in_zipdata(all_filenames, image_root)


    def select_objects(self, annotations):
        for object_anno in annotations:
            image_id = object_anno['image_id']
            _, _, w, h = object_anno['bbox']
            W, H = self.image_id_to_size[image_id]
            box_area = (w * h) / (W * H)
            box_ok = box_area > self.min_box_size
            object_name = self.object_idx_to_name[object_anno['category_id']]
            other_ok = object_name != 'other' or self.include_other
            if box_ok and other_ok:
                self.image_id_to_objects[image_id].append(object_anno)


    def total_images(self):
        return len(self)


    def __getitem__(self, index):
        if self.max_boxes_per_image > 99:
            assert False, "Are you sure setting such large number of boxes?"

        out = {}

        image_id = self.image_ids[index]
        out['id'] = image_id

        flip = self.random_flip and random.random() < 0.5

        # Image
        filename = self.image_id_to_filename[image_id]
        zip_file = self.fetch_zipfile(self.image_root)
        image = Image.open(BytesIO(zip_file.read(filename))).convert('RGB')
        WW, HH = image.size
        if flip:
            image = ImageOps.mirror(image)
        out["image"] = self.transform(image)

        this_image_obj_annos = deepcopy(self.image_id_to_objects[image_id])

        # Make a sentence
        obj_names = []  # used for make a sentence
        boxes = torch.zeros(self.max_boxes_per_image, 4)
        masks = torch.zeros(self.max_boxes_per_image)
        positive_embeddings = torch.zeros(self.max_boxes_per_image, self.embedding_len)
        for idx, object_anno in enumerate(this_image_obj_annos):
            obj_name = self.object_idx_to_name[object_anno['category_id']]
            obj_names.append(obj_name)
            x, y, w, h = object_anno['bbox']
            x0 = x / WW
            y0 = y / HH
            x1 = (x + w) / WW
            y1 = (y + h) / HH
            if flip:
                x0, x1 = 1 - x1, 1 - x0
            boxes[idx] = torch.tensor([x0, y0, x1, y1])
            masks[idx] = 1
            positive_embeddings[idx] = self.category_embeddings[obj_name]

        if self.fake_caption_type == 'empty':
            caption = ""
        else:
            caption = make_a_sentence(obj_names, clean=True)

        out["caption"] = caption
        out["boxes"] = boxes
        out["masks"] = masks
        out["positive_embeddings"] = positive_embeddings

        return out


    def __len__(self):
        if self.max_samples is None:
            return len(self.image_ids)
        return min(len(self.image_ids), self.max_samples)
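One detail worth calling out in `__getitem__` above: because `LayoutDataset` resizes the whole image to a square, boxes are normalized by the raw width/height, and a horizontal flip reduces to the coordinate swap `x0, x1 = 1-x1, 1-x0`. A two-line check with made-up numbers (illustrative only):

# A box covering the horizontal 30%-50% band of the image, normalized to 0-1.
x0, y0, x1, y1 = 0.3, 0.2, 0.5, 0.8

# Horizontal mirror: the box moves to the right side, its width is unchanged.
x0, x1 = 1 - x1, 1 - x0
print((x0, x1))              # (0.5, 0.7)
assert abs((x1 - x0) - 0.2) < 1e-9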
dataset/tsv.py
ADDED
@@ -0,0 +1,212 @@
import os
import os.path as op
import gc
import json
from typing import List
import logging

try:
    from .blob_storage import BlobStorage, disk_usage
except:
    class BlobStorage:
        pass


def generate_lineidx(filein: str, idxout: str) -> None:
    idxout_tmp = idxout + '.tmp'
    with open(filein, 'r') as tsvin, open(idxout_tmp, 'w') as tsvout:
        fsize = os.fstat(tsvin.fileno()).st_size
        fpos = 0
        while fpos != fsize:
            tsvout.write(str(fpos) + "\n")
            tsvin.readline()
            fpos = tsvin.tell()
    os.rename(idxout_tmp, idxout)


def read_to_character(fp, c):
    result = []
    while True:
        s = fp.read(32)
        assert s != ''
        if c in s:
            result.append(s[: s.index(c)])
            break
        else:
            result.append(s)
    return ''.join(result)


class TSVFile(object):
    def __init__(self,
                 tsv_file: str,
                 if_generate_lineidx: bool = False,
                 lineidx: str = None,
                 class_selector: List[str] = None,
                 blob_storage: BlobStorage = None):
        self.tsv_file = tsv_file
        self.lineidx = op.splitext(tsv_file)[0] + '.lineidx' \
            if not lineidx else lineidx
        self.linelist = op.splitext(tsv_file)[0] + '.linelist'
        self.chunks = op.splitext(tsv_file)[0] + '.chunks'
        self._fp = None
        self._lineidx = None
        self._sample_indices = None
        self._class_boundaries = None
        self._class_selector = class_selector
        self._blob_storage = blob_storage
        self._len = None
        # the process always keeps the process which opens the file.
        # If the pid is not equal to the currrent pid, we will re-open the file.
        self.pid = None
        # generate lineidx if not exist
        if not op.isfile(self.lineidx) and if_generate_lineidx:
            generate_lineidx(self.tsv_file, self.lineidx)

    def __del__(self):
        self.gcidx()
        if self._fp:
            self._fp.close()
            # physically remove the tsv file if it is retrieved by BlobStorage
            if self._blob_storage and 'azcopy' in self.tsv_file and os.path.exists(self.tsv_file):
                try:
                    original_usage = disk_usage('/')
                    os.remove(self.tsv_file)
                    logging.info("Purged %s (disk usage: %.2f%% => %.2f%%)" %
                                 (self.tsv_file, original_usage, disk_usage('/') * 100))
                except:
                    # Known issue: multiple threads attempting to delete the file will raise a FileNotFound error.
                    # TODO: try Threadling.Lock to better handle the race condition
                    pass

    def __str__(self):
        return "TSVFile(tsv_file='{}')".format(self.tsv_file)

    def __repr__(self):
        return str(self)

    def gcidx(self):
        logging.debug('Run gc collect')
        self._lineidx = None
        self._sample_indices = None
        # self._class_boundaries = None
        return gc.collect()

    def get_class_boundaries(self):
        return self._class_boundaries

    def num_rows(self, gcf=False):
        if (self._len is None):
            self._ensure_lineidx_loaded()
            retval = len(self._sample_indices)

            if (gcf):
                self.gcidx()

            self._len = retval

        return self._len

    def seek(self, idx: int):
        self._ensure_tsv_opened()
        self._ensure_lineidx_loaded()
        try:
            pos = self._lineidx[self._sample_indices[idx]]
        except:
            logging.info('=> {}-{}'.format(self.tsv_file, idx))
            raise
        self._fp.seek(pos)
        return [s.strip() for s in self._fp.readline().split('\t')]

    def seek_first_column(self, idx: int):
        self._ensure_tsv_opened()
        self._ensure_lineidx_loaded()
        pos = self._lineidx[idx]
        self._fp.seek(pos)
        return read_to_character(self._fp, '\t')

    def get_key(self, idx: int):
        return self.seek_first_column(idx)

    def __getitem__(self, index: int):
        return self.seek(index)

    def __len__(self):
        return self.num_rows()

    def _ensure_lineidx_loaded(self):
        if self._lineidx is None:
            logging.debug('=> loading lineidx: {}'.format(self.lineidx))
            with open(self.lineidx, 'r') as fp:
                lines = fp.readlines()
                lines = [line.strip() for line in lines]
                self._lineidx = [int(line) for line in lines]

            # read the line list if exists
            linelist = None
            if op.isfile(self.linelist):
                with open(self.linelist, 'r') as fp:
                    linelist = sorted(
                        [
                            int(line.strip())
                            for line in fp.readlines()
                        ]
                    )

            if op.isfile(self.chunks):
                self._sample_indices = []
                self._class_boundaries = []
                class_boundaries = json.load(open(self.chunks, 'r'))
                for class_name, boundary in class_boundaries.items():
                    start = len(self._sample_indices)
                    if class_name in self._class_selector:
                        for idx in range(boundary[0], boundary[1] + 1):
                            # NOTE: potentially slow when linelist is long, try to speed it up
                            if linelist and idx not in linelist:
                                continue
                            self._sample_indices.append(idx)
                    end = len(self._sample_indices)
                    self._class_boundaries.append((start, end))
            else:
                if linelist:
                    self._sample_indices = linelist
                else:
                    self._sample_indices = list(range(len(self._lineidx)))

    def _ensure_tsv_opened(self):
        if self._fp is None:
            if self._blob_storage:
                self._fp = self._blob_storage.open(self.tsv_file)
            else:
                self._fp = open(self.tsv_file, 'r')
            self.pid = os.getpid()

        if self.pid != os.getpid():
            logging.debug('=> re-open {} because the process id changed'.format(self.tsv_file))
            self._fp = open(self.tsv_file, 'r')
            self.pid = os.getpid()


class TSVWriter(object):
    def __init__(self, tsv_file):
        self.tsv_file = tsv_file
        self.lineidx_file = op.splitext(tsv_file)[0] + '.lineidx'
        self.tsv_file_tmp = self.tsv_file + '.tmp'
        self.lineidx_file_tmp = self.lineidx_file + '.tmp'

        self.tsv_fp = open(self.tsv_file_tmp, 'w')
        self.lineidx_fp = open(self.lineidx_file_tmp, 'w')

        self.idx = 0

    def write(self, values, sep='\t'):
        v = '{0}\n'.format(sep.join(map(str, values)))
        self.tsv_fp.write(v)
        self.lineidx_fp.write(str(self.idx) + '\n')
        self.idx = self.idx + len(v)

    def close(self):
        self.tsv_fp.close()
        self.lineidx_fp.close()
        os.rename(self.tsv_file_tmp, self.tsv_file)
        os.rename(self.lineidx_file_tmp, self.lineidx_file)
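A quick round-trip of the writer/reader pair above (illustrative only, not part of the commit): it writes two throwaway files into a temporary directory and assumes the module is importable as `dataset.tsv` from the repo root. `TSVWriter` emits a `.lineidx` file with the byte offset of each row, which `TSVFile` then uses for random access.

import tempfile, os
from dataset.tsv import TSVWriter, TSVFile

tmp = tempfile.mkdtemp()
path = os.path.join(tmp, "toy.tsv")

# Write two rows; toy.lineidx records where each row starts.
writer = TSVWriter(path)
writer.write(["img_0", "first row payload"])
writer.write(["img_1", "second row payload"])
writer.close()

reader = TSVFile(path)
print(len(reader))        # 2
print(reader.get_key(1))  # img_1
print(reader[0])          # ['img_0', 'first row payload']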
dataset/tsv_dataset.py
ADDED
@@ -0,0 +1,326 @@
import torch
import json
from collections import defaultdict
from PIL import Image, ImageDraw
from copy import deepcopy
import os
import torchvision.transforms as transforms
import torchvision
from .base_dataset import BaseDataset, check_filenames_in_zipdata, recalculate_box_and_verify_if_valid
from io import BytesIO
import random
import base64
import numpy as np

from .tsv import TSVFile


def decode_base64_to_pillow(image_b64):
    return Image.open(BytesIO(base64.b64decode(image_b64))).convert('RGB')

def decode_tensor_from_string(arr_str, use_tensor=True):
    arr = np.frombuffer(base64.b64decode(arr_str), dtype='float32')
    if use_tensor:
        arr = torch.from_numpy(arr)
    return arr

def decode_item(item):
    item = json.loads(item)
    item['image'] = decode_base64_to_pillow(item['image'])

    for anno in item['annos']:
        anno['image_embedding_before'] = decode_tensor_from_string(anno['image_embedding_before'])
        anno['text_embedding_before'] = decode_tensor_from_string(anno['text_embedding_before'])
        anno['image_embedding_after'] = decode_tensor_from_string(anno['image_embedding_after'])
        anno['text_embedding_after'] = decode_tensor_from_string(anno['text_embedding_after'])
    return item

def check_unique(images, fields):
    for field in fields:
        temp_list = []
        for img_info in images:
            temp_list.append(img_info[field])
        assert len(set(temp_list)) == len(temp_list), field

def clean_data(data):
    for data_info in data:
        data_info.pop("original_img_id", None)
        data_info.pop("original_id", None)
        data_info.pop("sentence_id", None)  # sentence id for each image (multiple sentences for one image)
        data_info.pop("dataset_name", None)
        data_info.pop("data_source", None)
        data_info["data_id"] = data_info.pop("id")


def clean_annotations(annotations):
    for anno_info in annotations:
        anno_info.pop("iscrowd", None)  # I have checked that all 0 for flickr, vg, coco
        anno_info.pop("category_id", None)  # I have checked that all 1 for flickr vg. This is not always 1 for coco, but I do not think we need this annotation
        anno_info.pop("area", None)
        # anno_info.pop("id", None)
        anno_info["data_id"] = anno_info.pop("image_id")


def draw_box(img, boxes):
    draw = ImageDraw.Draw(img)
    for box in boxes:
        draw.rectangle([box[0], box[1], box[2], box[3]], outline="red", width=2)  # x0 y0 x1 y1
    return img


def xyhw2xyxy(box):
    x0, y0, w, h = box
    return [x0, y0, x0 + w, y0 + h]


def make_a_sentence(obj_names, clean=False):

    if clean:
        obj_names = [name[:-6] if ("-other" in name) else name for name in obj_names]

    caption = ""
    tokens_positive = []
    for obj_name in obj_names:
        start_len = len(caption)
        caption += obj_name
        end_len = len(caption)
        caption += ", "
        tokens_positive.append(
            [[start_len, end_len]]  # in real caption, positive tokens can be disjoint, thus using list of list
        )
    caption = caption[:-2]  # remove last ", "

    return caption  # , tokens_positive


def mask_for_random_drop_text_or_image_feature(masks, random_drop_embedding):
    """
    input masks tell how many valid grounding tokens for this image
    e.g., 1,1,1,1,0,0,0,0,0,0...

    If random_drop_embedding='both', we will randomly drop either the image or
    the text feature for each token, but we always make sure there is at least
    one feature used. In other words, the following masks are not valid
    (because for the second obj, no feature at all):
    image: 1,0,1,1,0,0,0,0,0
    text:  1,0,0,0,0,0,0,0,0

    If random_drop_embedding='image', we will randomly drop the image feature
    and always keep the text one.
    """
    N = masks.shape[0]

    if random_drop_embedding == 'both':
        temp_mask = torch.ones(2, N)
        for i in range(N):
            if random.uniform(0, 1) < 0.5:  # else keep both features
                idx = random.sample([0, 1], 1)[0]  # randomly choose to drop image or text feature
                temp_mask[idx, i] = 0
        image_masks = temp_mask[0] * masks
        text_masks = temp_mask[1] * masks

    if random_drop_embedding == 'image':
        image_masks = masks * (torch.rand(N) > 0.5) * 1
        text_masks = masks

    return image_masks, text_masks


def project(x, projection_matrix):
    """
    x (Batch*768) should be the penultimate feature of CLIP (before projection)
    projection_matrix (768*768) is the CLIP projection matrix, which should be weight.data of Linear layer
    defined in CLIP (out_dim, in_dim), thus we need to apply transpose below.
    this function will return the CLIP feature (without normalization)
    """
    return x @ torch.transpose(projection_matrix, 0, 1)


def inv_project(y, projection_matrix):
    """
    y (Batch*768) should be the CLIP feature (after projection)
    projection_matrix (768*768) is the CLIP projection matrix, which should be weight.data of Linear layer
    defined in CLIP (out_dim, in_dim).
    this function will return the CLIP penultimate feature.

    Note: to make sure getting the correct penultimate feature, the input y should not be normalized.
    If it is normalized, then the result will be scaled by CLIP feature norm, which is unknown.
    """
    return y @ torch.transpose(torch.linalg.inv(projection_matrix), 0, 1)


class TSVDataset(BaseDataset):
    def __init__(self,
                 tsv_path,
                 which_embedder='clip',
                 which_layer=['after', 'after'],  # text and image
                 prob_use_caption=1,
                 random_drop_embedding='none',
                 image_size=256,
                 min_box_size=0.01,
                 max_boxes_per_data=8,
                 max_images=None,  # set as 30K used to eval
                 random_crop=False,
                 random_flip=True,
                 ):
        image_root = "a placeholder path as we are using tsv here"
        super().__init__(image_root, random_crop, random_flip, image_size)
        self.tsv_path = tsv_path
        self.which_embedder = which_embedder
        self.prob_use_caption = prob_use_caption
        self.random_drop_embedding = random_drop_embedding
        self.min_box_size = min_box_size
        self.max_boxes_per_data = max_boxes_per_data
        self.max_images = max_images

        assert which_layer in [['after', 'after'], ['before', 'after_renorm'], ['before', 'after_reproject']]
        assert random_drop_embedding in ['none', 'both', 'image']
        self.which_layer_text = which_layer[0]
        self.which_layer_image = which_layer[1]

        # self.projection_matrix = torch.load(os.path.join(os.path.dirname(__file__), 'projection_matrix'))
        self.projection_matrix = torch.load('projection_matrix.pth')

        # Load tsv data
        self.tsv_file = TSVFile(self.tsv_path)

        # Load preprocessed name embedding
        if which_embedder == 'bert':
            self.embedding_len = 1280
        elif which_embedder == 'clip':
            self.embedding_len = 768
        else:
            assert False

    def total_images(self):
        return len(self)

    def get_item_from_tsv(self, index):
        _, item = self.tsv_file[index]
        item = decode_item(item)
        return item

    def mapping(self, image_embedding):
        if self.which_layer_image == 'after':
            # both use CLIP aligned feature
            return image_embedding
        elif self.which_layer_image == 'after_renorm':
            # text use before, but image use after projection but normalize to 28.7
            return image_embedding * 28.7
        elif self.which_layer_image == 'after_reproject':
            image_embedding = project(image_embedding.unsqueeze(0), self.projection_matrix.T)
            image_embedding = image_embedding.squeeze(0)
            image_embedding = image_embedding / image_embedding.norm()
            image_embedding = image_embedding * 28.7
            return image_embedding

    def __getitem__(self, index):
        if self.max_boxes_per_data > 99:
            assert False, "Are you sure setting such large number of boxes?"

        raw_item = self.get_item_from_tsv(index)
        is_det = raw_item.get('is_det', False)  # if it is from detection (such as o365), then we will make a caption

        out = {}

        # -------------------- id and image ------------------- #
        out['id'] = raw_item['data_id']
        image = raw_item['image']
        image_tensor, trans_info = self.transform_image(image)
        out["image"] = image_tensor

        # -------------------- grounding token ------------------- #
        annos = raw_item['annos']

        areas = []
        all_boxes = []
        all_masks = []
        all_text_embeddings = []
        all_image_embeddings = []
        if is_det:
            all_category_names = []

        text_embedding_name = 'text_embedding_before' if self.which_layer_text == 'before' else 'text_embedding_after'
        image_embedding_name = 'image_embedding_after'

        for anno in annos:
            x, y, w, h = anno['bbox']
            valid, (x0, y0, x1, y1) = recalculate_box_and_verify_if_valid(x, y, w, h, trans_info, self.image_size, self.min_box_size)

            if valid:
                areas.append((x1 - x0) * (y1 - y0))
                all_boxes.append(torch.tensor([x0, y0, x1, y1]) / self.image_size)  # scale to 0-1
                all_masks.append(1)
                all_text_embeddings.append(anno[text_embedding_name])
                all_image_embeddings.append(self.mapping(anno[image_embedding_name]))
                if is_det:
                    all_category_names.append(anno["category_name"])

        wanted_idxs = torch.tensor(areas).sort(descending=True)[1]
        wanted_idxs = wanted_idxs[0:self.max_boxes_per_data]

        boxes = torch.zeros(self.max_boxes_per_data, 4)
        masks = torch.zeros(self.max_boxes_per_data)
        text_embeddings = torch.zeros(self.max_boxes_per_data, self.embedding_len)
        image_embeddings = torch.zeros(self.max_boxes_per_data, self.embedding_len)
        if is_det:
            category_names = []
        for i, idx in enumerate(wanted_idxs):
            boxes[i] = all_boxes[idx]
            masks[i] = all_masks[idx]
            text_embeddings[i] = all_text_embeddings[idx]
            image_embeddings[i] = all_image_embeddings[idx]
            if is_det:
                category_names.append(all_category_names[idx])

        if self.random_drop_embedding != 'none':
            image_masks, text_masks = mask_for_random_drop_text_or_image_feature(masks, self.random_drop_embedding)
        else:
            image_masks = masks
            text_masks = masks

        out["boxes"] = boxes
        out["masks"] = masks
        out["image_masks"] = image_masks
        out["text_masks"] = text_masks
        out["text_embeddings"] = text_embeddings
        out["image_embeddings"] = image_embeddings

        # -------------------- caption ------------------- #
        if random.uniform(0, 1) < self.prob_use_caption:
            if is_det:
                out["caption"] = make_a_sentence(category_names)
            else:
                out["caption"] = raw_item["caption"]
        else:
            out["caption"] = ""

        return out

    def __len__(self):
        return len(self.tsv_file)
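As a sanity check of the project/inv_project pair above, the following sketch verifies on a random matrix that inv_project undoes project. The two functions are restated inline so the check is self-contained; the random matrix is only a stand-in for the real CLIP projection weights.

import torch

def project(x, projection_matrix):          # mirrors dataset/tsv_dataset.py
    return x @ torch.transpose(projection_matrix, 0, 1)

def inv_project(y, projection_matrix):      # mirrors dataset/tsv_dataset.py
    return y @ torch.transpose(torch.linalg.inv(projection_matrix), 0, 1)

torch.manual_seed(0)
W = torch.randn(768, 768, dtype=torch.float64)   # stand-in for the CLIP projection weight (out_dim, in_dim)
x = torch.randn(4, 768, dtype=torch.float64)     # stand-in penultimate CLIP features

y = project(x, W)          # x @ W.T, the "after projection" feature
x_rec = inv_project(y, W)  # y @ (W^-1).T should recover x

print(torch.allclose(x, x_rec, atol=1e-6))  # -> True, up to numerical error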
dataset/utils.py
ADDED
@@ -0,0 +1,116 @@
#!/usr/bin/python
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import PIL
import torch
import torchvision.transforms as T


IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

INV_IMAGENET_MEAN = [-m for m in IMAGENET_MEAN]
INV_IMAGENET_STD = [1.0 / s for s in IMAGENET_STD]


def imagenet_preprocess():
    return T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)


def rescale(x):
    lo, hi = x.min(), x.max()
    return x.sub(lo).div(hi - lo)


def imagenet_deprocess(rescale_image=True):
    transforms = [
        T.Normalize(mean=[0, 0, 0], std=INV_IMAGENET_STD),
        T.Normalize(mean=INV_IMAGENET_MEAN, std=[1.0, 1.0, 1.0]),
    ]
    if rescale_image:
        transforms.append(rescale)
    return T.Compose(transforms)


def imagenet_deprocess_batch(imgs, rescale=True):
    """
    Input:
    - imgs: FloatTensor of shape (N, C, H, W) giving preprocessed images

    Output:
    - imgs_de: ByteTensor of shape (N, C, H, W) giving deprocessed images
      in the range [0, 255]
    """
    if isinstance(imgs, torch.autograd.Variable):
        imgs = imgs.data
    imgs = imgs.cpu().clone()
    deprocess_fn = imagenet_deprocess(rescale_image=rescale)
    imgs_de = []
    for i in range(imgs.size(0)):
        img_de = deprocess_fn(imgs[i])[None]
        img_de = img_de.mul(255).clamp(0, 255).byte()
        imgs_de.append(img_de)
    imgs_de = torch.cat(imgs_de, dim=0)
    return imgs_de


class Resize(object):
    def __init__(self, size, interp=PIL.Image.BILINEAR):
        if isinstance(size, tuple):
            H, W = size
            self.size = (W, H)
        else:
            self.size = (size, size)
        self.interp = interp

    def __call__(self, img):
        return img.resize(self.size, self.interp)


def unpack_var(v):
    if isinstance(v, torch.autograd.Variable):
        return v.data
    return v


def split_graph_batch(triples, obj_data, obj_to_img, triple_to_img):
    triples = unpack_var(triples)
    obj_data = [unpack_var(o) for o in obj_data]
    obj_to_img = unpack_var(obj_to_img)
    triple_to_img = unpack_var(triple_to_img)

    triples_out = []
    obj_data_out = [[] for _ in obj_data]
    obj_offset = 0
    N = obj_to_img.max() + 1
    for i in range(N):
        o_idxs = (obj_to_img == i).nonzero().view(-1)
        t_idxs = (triple_to_img == i).nonzero().view(-1)

        cur_triples = triples[t_idxs].clone()
        cur_triples[:, 0] -= obj_offset
        cur_triples[:, 2] -= obj_offset
        triples_out.append(cur_triples)

        for j, o_data in enumerate(obj_data):
            cur_o_data = None
            if o_data is not None:
                cur_o_data = o_data[o_idxs]
            obj_data_out[j].append(cur_o_data)

        obj_offset += o_idxs.size(0)

    return triples_out, obj_data_out
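An illustrative round trip through the helpers above: normalize a fake batch with imagenet_preprocess(), then map it back to uint8 with imagenet_deprocess_batch(). The import path assumes the repository root is on PYTHONPATH.

import torch
from dataset.utils import imagenet_preprocess, imagenet_deprocess_batch  # assumes repo root on PYTHONPATH

imgs = torch.rand(2, 3, 64, 64)              # fake images in [0, 1]
normalize = imagenet_preprocess()
imgs_norm = torch.stack([normalize(img) for img in imgs])

imgs_back = imagenet_deprocess_batch(imgs_norm, rescale=False)
print(imgs_back.shape, imgs_back.dtype)      # -> torch.Size([2, 3, 64, 64]) torch.uint8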
gligen/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (345 Bytes).
gligen/__pycache__/distributed.cpython-38.pyc
ADDED
Binary file (2.91 kB).
gligen/__pycache__/evaluator.cpython-38.pyc
ADDED
Binary file (5.9 kB).
gligen/__pycache__/task_grounded_generation.cpython-38.pyc
ADDED
Binary file (9.11 kB).
gligen/__pycache__/trainer.cpython-38.pyc
ADDED
Binary file (11.7 kB).
gligen/ldm/__pycache__/util.cpython-38.pyc
ADDED
Binary file (3.2 kB).
gligen/ldm/models/.DS_Store
ADDED
Binary file (6.15 kB).
gligen/ldm/models/__pycache__/autoencoder.cpython-38.pyc
ADDED
Binary file (1.58 kB).
gligen/ldm/models/autoencoder.py
ADDED
@@ -0,0 +1,52 @@
import torch
import torch.nn as nn
# import pytorch_lightning as pl
import torch.nn.functional as F
from contextlib import contextmanager

# from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer

from ldm.modules.diffusionmodules.model import Encoder, Decoder
from ldm.modules.distributions.distributions import DiagonalGaussianDistribution

from ldm.util import instantiate_from_config


class AutoencoderKL(nn.Module):
    def __init__(self,
                 ddconfig,
                 embed_dim,
                 scale_factor=1
                 ):
        super().__init__()
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        assert ddconfig["double_z"]
        self.quant_conv = torch.nn.Conv2d(2 * ddconfig["z_channels"], 2 * embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
        self.embed_dim = embed_dim
        self.scale_factor = scale_factor

    def encode(self, x):
        h = self.encoder(x)
        moments = self.quant_conv(h)
        posterior = DiagonalGaussianDistribution(moments)
        return posterior.sample() * self.scale_factor

    def decode(self, z):
        z = 1. / self.scale_factor * z
        z = self.post_quant_conv(z)
        dec = self.decoder(z)
        return dec
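A shape-level sketch of what encode() above produces, assuming the usual Stable Diffusion first-stage setup (embed_dim=4, so quant_conv outputs 8 channels of mean/logvar moments). Only DiagonalGaussianDistribution from the imports above is used; the value 0.18215 is merely the conventional Stable Diffusion scale factor used for illustration, since scale_factor defaults to 1 here.

import torch
from ldm.modules.distributions.distributions import DiagonalGaussianDistribution  # assumes gligen/ on PYTHONPATH

moments = torch.randn(1, 8, 32, 32)          # stand-in for quant_conv output: 2*embed_dim channels (mean, logvar)
posterior = DiagonalGaussianDistribution(moments)
z = posterior.sample() * 0.18215             # same scaling step as encode(); 0.18215 is illustrative
print(z.shape)                               # -> torch.Size([1, 4, 32, 32])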
gligen/ldm/models/diffusion/__init__.py
ADDED
File without changes
gligen/ldm/models/diffusion/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (159 Bytes).
gligen/ldm/models/diffusion/__pycache__/ddim.cpython-38.pyc
ADDED
Binary file (4.57 kB).
gligen/ldm/models/diffusion/__pycache__/ddpm.cpython-38.pyc
ADDED
Binary file (2.12 kB).
gligen/ldm/models/diffusion/__pycache__/gaussian_smoothing.cpython-38.pyc
ADDED
Binary file (4.11 kB).
gligen/ldm/models/diffusion/__pycache__/ldm.cpython-38.pyc
ADDED
Binary file (1.21 kB).
gligen/ldm/models/diffusion/__pycache__/loss.cpython-38.pyc
ADDED
Binary file (4.23 kB).
gligen/ldm/models/diffusion/__pycache__/plms.cpython-38.pyc
ADDED
Binary file (8.71 kB).
gligen/ldm/models/diffusion/classifier.py
ADDED
@@ -0,0 +1,267 @@
import os
import torch
import pytorch_lightning as pl
from omegaconf import OmegaConf
from torch.nn import functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from copy import deepcopy
from einops import rearrange
from glob import glob
from natsort import natsorted

from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config

__models__ = {
    'class_label': EncoderUNetModel,
    'segmentation': UNetModel
}


def disabled_train(self, mode=True):
    """Overwrite model.train with this function to make sure train/eval mode
    does not change anymore."""
    return self


class NoisyLatentImageClassifier(pl.LightningModule):

    def __init__(self,
                 diffusion_path,
                 num_classes,
                 ckpt_path=None,
                 pool='attention',
                 label_key=None,
                 diffusion_ckpt_path=None,
                 scheduler_config=None,
                 weight_decay=1.e-2,
                 log_steps=10,
                 monitor='val/loss',
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.num_classes = num_classes
        # get latest config of diffusion model
        diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
        self.diffusion_config = OmegaConf.load(diffusion_config).model
        self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
        self.load_diffusion()

        self.monitor = monitor
        self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
        self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
        self.log_steps = log_steps

        self.label_key = label_key if not hasattr(self.diffusion_model, 'cond_stage_key') \
            else self.diffusion_model.cond_stage_key

        assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'

        if self.label_key not in __models__:
            raise NotImplementedError()

        self.load_classifier(ckpt_path, pool)

        self.scheduler_config = scheduler_config
        self.use_scheduler = self.scheduler_config is not None
        self.weight_decay = weight_decay

    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
        sd = torch.load(path, map_location="cpu")
        if "state_dict" in list(sd.keys()):
            sd = sd["state_dict"]
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
                    print("Deleting key {} from state_dict.".format(k))
                    del sd[k]
        missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
            sd, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    def load_diffusion(self):
        model = instantiate_from_config(self.diffusion_config)
        self.diffusion_model = model.eval()
        self.diffusion_model.train = disabled_train
        for param in self.diffusion_model.parameters():
            param.requires_grad = False

    def load_classifier(self, ckpt_path, pool):
        model_config = deepcopy(self.diffusion_config.params.unet_config.params)
        model_config.in_channels = self.diffusion_config.params.unet_config.params.out_channels
        model_config.out_channels = self.num_classes
        if self.label_key == 'class_label':
            model_config.pool = pool

        self.model = __models__[self.label_key](**model_config)
        if ckpt_path is not None:
            print('#####################################################################')
            print(f'load from ckpt "{ckpt_path}"')
            print('#####################################################################')
            self.init_from_ckpt(ckpt_path)

    @torch.no_grad()
    def get_x_noisy(self, x, t, noise=None):
        noise = default(noise, lambda: torch.randn_like(x))
        continuous_sqrt_alpha_cumprod = None
        if self.diffusion_model.use_continuous_noise:
            continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
            # todo: make sure t+1 is correct here

        return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
                                             continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)

    def forward(self, x_noisy, t, *args, **kwargs):
        return self.model(x_noisy, t)

    @torch.no_grad()
    def get_input(self, batch, k):
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
        x = rearrange(x, 'b h w c -> b c h w')
        x = x.to(memory_format=torch.contiguous_format).float()
        return x

    @torch.no_grad()
    def get_conditioning(self, batch, k=None):
        if k is None:
            k = self.label_key
        assert k is not None, 'Needs to provide label key'

        targets = batch[k].to(self.device)

        if self.label_key == 'segmentation':
            targets = rearrange(targets, 'b h w c -> b c h w')
            for down in range(self.numd):
                h, w = targets.shape[-2:]
                targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')

            # targets = rearrange(targets,'b c h w -> b h w c')

        return targets

    def compute_top_k(self, logits, labels, k, reduction="mean"):
        _, top_ks = torch.topk(logits, k, dim=1)
        if reduction == "mean":
            return (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
        elif reduction == "none":
            return (top_ks == labels[:, None]).float().sum(dim=-1)

    def on_train_epoch_start(self):
        # save some memory
        self.diffusion_model.model.to('cpu')

    @torch.no_grad()
    def write_logs(self, loss, logits, targets):
        log_prefix = 'train' if self.training else 'val'
        log = {}
        log[f"{log_prefix}/loss"] = loss.mean()
        log[f"{log_prefix}/acc@1"] = self.compute_top_k(
            logits, targets, k=1, reduction="mean"
        )
        log[f"{log_prefix}/acc@5"] = self.compute_top_k(
            logits, targets, k=5, reduction="mean"
        )

        self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
        self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
        self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
        lr = self.optimizers().param_groups[0]['lr']
        self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)

    def shared_step(self, batch, t=None):
        x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
        targets = self.get_conditioning(batch)
        if targets.dim() == 4:
            targets = targets.argmax(dim=1)
        if t is None:
            t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
        else:
            t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
        x_noisy = self.get_x_noisy(x, t)
        logits = self(x_noisy, t)

        loss = F.cross_entropy(logits, targets, reduction='none')

        self.write_logs(loss.detach(), logits.detach(), targets.detach())

        loss = loss.mean()
        return loss, logits, x_noisy, targets

    def training_step(self, batch, batch_idx):
        loss, *_ = self.shared_step(batch)
        return loss

    def reset_noise_accs(self):
        self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
                          range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}

    def on_validation_start(self):
        self.reset_noise_accs()

    @torch.no_grad()
    def validation_step(self, batch, batch_idx):
        loss, *_ = self.shared_step(batch)

        for t in self.noisy_acc:
            _, logits, _, targets = self.shared_step(batch, t)
            self.noisy_acc[t]['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
            self.noisy_acc[t]['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))

        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)

        if self.use_scheduler:
            scheduler = instantiate_from_config(self.scheduler_config)

            print("Setting up LambdaLR scheduler...")
            scheduler = [
                {
                    'scheduler': LambdaLR(optimizer, lr_lambda=scheduler.schedule),
                    'interval': 'step',
                    'frequency': 1
                }]
            return [optimizer], scheduler

        return optimizer

    @torch.no_grad()
    def log_images(self, batch, N=8, *args, **kwargs):
        log = dict()
        x = self.get_input(batch, self.diffusion_model.first_stage_key)
        log['inputs'] = x

        y = self.get_conditioning(batch)

        if self.label_key == 'class_label':
            y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
            log['labels'] = y

        if ismap(y):
            log['labels'] = self.diffusion_model.to_rgb(y)

            for step in range(self.log_steps):
                current_time = step * self.log_time_interval

                _, logits, x_noisy, _ = self.shared_step(batch, t=current_time)

                log[f'inputs@t{current_time}'] = x_noisy

                pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
                pred = rearrange(pred, 'b h w c -> b c h w')

                log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)

        for key in log:
            log[key] = log[key][:N]

        return log
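A small, self-contained check of the top-k accuracy logic used by compute_top_k above, restated without the LightningModule wrapper; the logits and labels are made up for illustration.

import torch

def compute_top_k(logits, labels, k, reduction="mean"):   # mirrors the method above
    _, top_ks = torch.topk(logits, k, dim=1)
    if reduction == "mean":
        return (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
    return (top_ks == labels[:, None]).float().sum(dim=-1)

logits = torch.tensor([[0.1, 2.0, 0.3],      # predicted class 1
                       [1.5, 0.2, 0.1]])     # predicted class 0
labels = torch.tensor([1, 2])                # second sample is wrong at k=1

print(compute_top_k(logits, labels, k=1))    # -> 0.5
print(compute_top_k(logits, labels, k=3))    # -> 1.0 (true class always within top-3)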
gligen/ldm/models/diffusion/ddim.py
ADDED
@@ -0,0 +1,134 @@
import torch
import numpy as np
from tqdm import tqdm
from functools import partial

from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like


class DDIMSampler(object):
    def __init__(self, diffusion, model, schedule="linear", alpha_generator_func=None, set_alpha_scale=None):
        super().__init__()
        self.diffusion = diffusion
        self.model = model
        self.device = diffusion.betas.device
        self.ddpm_num_timesteps = diffusion.num_timesteps
        self.schedule = schedule
        self.alpha_generator_func = alpha_generator_func
        self.set_alpha_scale = set_alpha_scale

    def register_buffer(self, name, attr):
        if type(attr) == torch.Tensor:
            attr = attr.to(self.device)
        setattr(self, name, attr)

    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.):
        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
                                                  num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=False)
        alphas_cumprod = self.diffusion.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.device)

        self.register_buffer('betas', to_torch(self.diffusion.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.diffusion.alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        # ddim sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                   ddim_timesteps=self.ddim_timesteps,
                                                                                   eta=ddim_eta, verbose=False)
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
                    1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    @torch.no_grad()
    def sample(self, S, shape, input, uc=None, guidance_scale=1, mask=None, x0=None):
        self.make_schedule(ddim_num_steps=S)
        return self.ddim_sampling(shape, input, uc, guidance_scale, mask=mask, x0=x0)

    @torch.no_grad()
    def ddim_sampling(self, shape, input, uc, guidance_scale=1, mask=None, x0=None):
        b = shape[0]

        img = input["x"]
        if img is None:
            img = torch.randn(shape, device=self.device)
            input["x"] = img

        time_range = np.flip(self.ddim_timesteps)
        total_steps = self.ddim_timesteps.shape[0]

        # iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
        iterator = time_range

        if self.alpha_generator_func is not None:
            alphas = self.alpha_generator_func(len(iterator))

        for i, step in enumerate(iterator):

            # set alpha
            if self.alpha_generator_func is not None:
                self.set_alpha_scale(self.model, alphas[i])
                if alphas[i] == 0:
                    self.model.restore_first_conv_from_SD()

            # run
            index = total_steps - i - 1
            input["timesteps"] = torch.full((b,), step, device=self.device, dtype=torch.long)

            if mask is not None:
                assert x0 is not None
                img_orig = self.diffusion.q_sample(x0, input["timesteps"])
                img = img_orig * mask + (1. - mask) * img
                input["x"] = img

            img, pred_x0 = self.p_sample_ddim(input, index=index, uc=uc, guidance_scale=guidance_scale)
            input["x"] = img

        return img

    @torch.no_grad()
    def p_sample_ddim(self, input, index, uc=None, guidance_scale=1):

        e_t = self.model(input)
        if uc is not None and guidance_scale != 1:
            unconditional_input = dict(x=input["x"], timesteps=input["timesteps"], context=uc, inpainting_extra_input=input["inpainting_extra_input"], grounding_extra_input=input['grounding_extra_input'])
            e_t_uncond = self.model(unconditional_input)
            e_t = e_t_uncond + guidance_scale * (e_t - e_t_uncond)

        # select parameters corresponding to the currently considered timestep
        b = input["x"].shape[0]
        a_t = torch.full((b, 1, 1, 1), self.ddim_alphas[index], device=self.device)
        a_prev = torch.full((b, 1, 1, 1), self.ddim_alphas_prev[index], device=self.device)
        sigma_t = torch.full((b, 1, 1, 1), self.ddim_sigmas[index], device=self.device)
        sqrt_one_minus_at = torch.full((b, 1, 1, 1), self.ddim_sqrt_one_minus_alphas[index], device=self.device)

        # current prediction for x_0
        pred_x0 = (input["x"] - sqrt_one_minus_at * e_t) / a_t.sqrt()

        # direction pointing to x_t
        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
        noise = sigma_t * torch.randn_like(input["x"])
        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise

        return x_prev, pred_x0
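A scalar walkthrough of the update inside p_sample_ddim above with eta = 0 (so sigma_t = 0), using made-up alpha values; it only checks the algebra of the step, not the real schedule or model.

import torch

a_t, a_prev = torch.tensor(0.5), torch.tensor(0.7)      # illustrative alphas_cumprod at t and the previous DDIM step
x_t = torch.randn(1, 4, 8, 8)                           # current latent
e_t = torch.randn(1, 4, 8, 8)                           # stand-in for the model's noise prediction

pred_x0 = (x_t - (1 - a_t).sqrt() * e_t) / a_t.sqrt()   # current prediction for x_0
dir_xt = (1. - a_prev).sqrt() * e_t                     # direction pointing to x_t (sigma_t = 0)
x_prev = a_prev.sqrt() * pred_x0 + dir_xt               # deterministic DDIM step

# With e_t held fixed, re-deriving x_0 from x_prev at a_prev gives the same pred_x0:
pred_x0_again = (x_prev - (1 - a_prev).sqrt() * e_t) / a_prev.sqrt()
print(torch.allclose(pred_x0, pred_x0_again, atol=1e-6))  # -> True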
gligen/ldm/models/diffusion/ddpm.py
ADDED
@@ -0,0 +1,72 @@
import torch
import torch.nn as nn
import numpy as np
from functools import partial
from ldm.modules.diffusionmodules.util import make_beta_schedule


class DDPM(nn.Module):
    def __init__(self, beta_schedule="linear", timesteps=1000, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
        super().__init__()

        self.v_posterior = 0
        self.register_schedule(beta_schedule, timesteps, linear_start, linear_end, cosine_s)

    def register_schedule(self, beta_schedule="linear", timesteps=1000, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):

        betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
        alphas = 1. - betas
        alphas_cumprod = np.cumprod(alphas, axis=0)
        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

        timesteps, = betas.shape
        self.num_timesteps = int(timesteps)
        self.linear_start = linear_start
        self.linear_end = linear_end
        assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'

        to_torch = partial(torch.tensor, dtype=torch.float32)

        self.register_buffer('betas', to_torch(betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod) + self.v_posterior * betas
        # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)

        self.register_buffer('posterior_variance', to_torch(posterior_variance))

        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
        self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
        self.register_buffer('posterior_mean_coef1', to_torch(betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
        self.register_buffer('posterior_mean_coef2', to_torch((1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
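A quick numerical check (numpy only) of the posterior-variance identity noted in the comment inside register_schedule above, with v_posterior = 0 as set in __init__. The beta schedule here is an arbitrary stand-in, not make_beta_schedule.

import numpy as np

betas = np.linspace(1e-4, 2e-2, 1000)                 # illustrative schedule
alphas = 1. - betas
alphas_cumprod = np.cumprod(alphas, axis=0)
alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
# skip t = 0, where alphas_cumprod_prev == 1 and the reciprocal form degenerates
alt_form = 1. / (1. / (1. - alphas_cumprod_prev[1:]) + alphas[1:] / betas[1:])

print(np.allclose(posterior_variance[1:], alt_form))  # -> True: the two expressions agree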
gligen/ldm/models/diffusion/gaussian_smoothing.py
ADDED
@@ -0,0 +1,119 @@
import math
import numbers
import torch
from torch import nn
from torch.nn import functional as F


class GaussianSmoothing(nn.Module):
    """
    Apply gaussian smoothing on a
    1d, 2d or 3d tensor. Filtering is performed separately for each channel
    in the input using a depthwise convolution.
    Arguments:
        channels (int, sequence): Number of channels of the input tensors. Output will
            have this number of channels as well.
        kernel_size (int, sequence): Size of the gaussian kernel.
        sigma (float, sequence): Standard deviation of the gaussian kernel.
        dim (int, optional): The number of dimensions of the data.
            Default value is 2 (spatial).
    """
    def __init__(self, channels, kernel_size, sigma, dim=2):
        super(GaussianSmoothing, self).__init__()
        if isinstance(kernel_size, numbers.Number):
            kernel_size = [kernel_size] * dim
        if isinstance(sigma, numbers.Number):
            sigma = [sigma] * dim

        # The gaussian kernel is the product of the
        # gaussian function of each dimension.
        kernel = 1
        meshgrids = torch.meshgrid(
            [
                torch.arange(size, dtype=torch.float32)
                for size in kernel_size
            ]
        )
        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
            mean = (size - 1) / 2
            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * \
                      torch.exp(-((mgrid - mean) / (2 * std)) ** 2)

        # Make sure sum of values in gaussian kernel equals 1.
        kernel = kernel / torch.sum(kernel)

        # Reshape to depthwise convolutional weight
        kernel = kernel.view(1, 1, *kernel.size())
        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))

        self.register_buffer('weight', kernel)
        self.groups = channels

        if dim == 1:
            self.conv = F.conv1d
        elif dim == 2:
            self.conv = F.conv2d
        elif dim == 3:
            self.conv = F.conv3d
        else:
            raise RuntimeError(
                'Only 1, 2 and 3 dimensions are supported. Received {}.'.format(dim)
            )

    def forward(self, input):
        """
        Apply gaussian filter to input.
        Arguments:
            input (torch.Tensor): Input to apply gaussian filter on.
        Returns:
            filtered (torch.Tensor): Filtered output.
        """
        return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups)


class AverageSmoothing(nn.Module):
    """
    Apply average smoothing on a
    1d, 2d or 3d tensor. Filtering is performed separately for each channel
    in the input using a depthwise convolution.
    Arguments:
        channels (int, sequence): Number of channels of the input tensors. Output will
            have this number of channels as well.
        kernel_size (int, sequence): Size of the average kernel.
        dim (int, optional): The number of dimensions of the data.
            Default value is 2 (spatial).
    """
    def __init__(self, channels, kernel_size, dim=2):
        super(AverageSmoothing, self).__init__()

        # Make sure sum of values in average kernel equals 1.
        kernel = torch.ones(size=(kernel_size, kernel_size)) / (kernel_size * kernel_size)

        # Reshape to depthwise convolutional weight
        kernel = kernel.view(1, 1, *kernel.size())
        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))

        self.register_buffer('weight', kernel)
        self.groups = channels

        if dim == 1:
            self.conv = F.conv1d
        elif dim == 2:
            self.conv = F.conv2d
        elif dim == 3:
            self.conv = F.conv3d
        else:
            raise RuntimeError(
                'Only 1, 2 and 3 dimensions are supported. Received {}.'.format(dim)
            )

    def forward(self, input):
        """
        Apply average filter to input.
        Arguments:
            input (torch.Tensor): Input to apply average filter on.
        Returns:
            filtered (torch.Tensor): Filtered output.
        """
        return self.conv(input, weight=self.weight, groups=self.groups)
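A minimal CPU usage sketch of GaussianSmoothing, padded the same way loss.py uses it (reflect padding of 1 on each side so the 3x3 kernel preserves the spatial size). The import path assumes the gligen/ directory is on PYTHONPATH; the 16x16 map is a stand-in for an attention map.

import torch
import torch.nn.functional as F
from ldm.models.diffusion.gaussian_smoothing import GaussianSmoothing  # assumes gligen/ on PYTHONPATH

smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2)
att_map = torch.rand(16, 16)                                  # stand-in 16x16 attention map
inp = F.pad(att_map.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode='reflect')
out = smoothing(inp).squeeze(0).squeeze(0)
print(att_map.shape, out.shape)                               # -> torch.Size([16, 16]) torch.Size([16, 16])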
gligen/ldm/models/diffusion/ldm.py
ADDED
@@ -0,0 +1,88 @@
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from ldm.util import default
from ldm.modules.diffusionmodules.util import extract_into_tensor
from .ddpm import DDPM


class LatentDiffusion(DDPM):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # hardcoded
        self.clip_denoised = False

    def q_sample(self, x_start, t, noise=None):
        noise = default(noise, lambda: torch.randn_like(x_start))
        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)

    "Does not support DDPM sampling anymore. Only do DDIM or PLMS"

    # = = = = = = = = = = = = Below is for sampling = = = = = = = = = = = = #

    # def predict_start_from_noise(self, x_t, t, noise):
    #     return ( extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
    #              extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise )

    # def q_posterior(self, x_start, x_t, t):
    #     posterior_mean = (
    #             extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start +
    #             extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
    #     )
    #     posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape)
    #     posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
    #     return posterior_mean, posterior_variance, posterior_log_variance_clipped

    # def p_mean_variance(self, model, x, c, t):
    #     model_out = model(x, t, c)
    #     x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)

    #     if self.clip_denoised:
    #         x_recon.clamp_(-1., 1.)

    #     model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
    #     return model_mean, posterior_variance, posterior_log_variance, x_recon

    # @torch.no_grad()
    # def p_sample(self, model, x, c, t):
    #     b, *_, device = *x.shape, x.device
    #     model_mean, _, model_log_variance, x0 = self.p_mean_variance(model, x=x, c=c, t=t, )
    #     noise = torch.randn_like(x)

    #     # no noise when t == 0
    #     nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))

    #     return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, x0

    # @torch.no_grad()
    # def p_sample_loop(self, model, shape, c):
    #     device = self.betas.device
    #     b = shape[0]
    #     img = torch.randn(shape, device=device)

    #     iterator = tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps)
    #     for i in iterator:
    #         ts = torch.full((b,), i, device=device, dtype=torch.long)
    #         img, x0 = self.p_sample(model, img, c, ts)

    #     return img

    # @torch.no_grad()
    # def sample(self, model, shape, c, uc=None, guidance_scale=None):
    #     return self.p_sample_loop(model, shape, c)
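The q_sample above is the closed-form forward noising x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps. A scalar sketch with an illustrative alpha_bar_t, written without the class or extract_into_tensor:

import torch

sqrt_alphas_cumprod_t = torch.tensor(0.9)                        # sqrt(alpha_bar_t) at some t (illustrative)
sqrt_one_minus_alphas_cumprod_t = torch.tensor((1 - 0.9**2) ** 0.5)

x_start = torch.randn(2, 4, 8, 8)                                # clean latents
noise = torch.randn_like(x_start)

x_t = sqrt_alphas_cumprod_t * x_start + sqrt_one_minus_alphas_cumprod_t * noise

# the two coefficients satisfy a^2 + b^2 = 1, so the marginal variance is preserved
print(sqrt_alphas_cumprod_t**2 + sqrt_one_minus_alphas_cumprod_t**2)  # -> 1.0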
gligen/ldm/models/diffusion/loss.py
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import math

import torch
from ldm.models.diffusion.gaussian_smoothing import GaussianSmoothing
from torch.nn import functional as F
from torchvision.utils import save_image


def loss_one_att_outside(attn_map, bboxes, object_positions, t):
    # Average self-attention that locations inside each box pay to locations outside it.
    loss = 0
    object_number = len(bboxes)
    b, i, j = attn_map.shape
    H = W = int(math.sqrt(i))

    for obj_idx in range(object_number):
        for obj_box in bboxes[obj_idx]:
            mask = torch.zeros(size=(H, W)).cuda() if torch.cuda.is_available() else torch.zeros(size=(H, W))
            x_min, y_min, x_max, y_max = int(obj_box[0] * W), \
                int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)
            mask[y_min: y_max, x_min: x_max] = 1.
            mask_out = 1. - mask
            index = (mask == 1.).nonzero(as_tuple=False)
            index_in_key = index[:, 0] * H + index[:, 1]
            att_box = torch.zeros_like(attn_map)
            att_box[:, index_in_key, :] = attn_map[:, index_in_key, :]

            att_box = att_box.sum(axis=1) / index_in_key.shape[0]
            att_box = att_box.reshape(-1, H, H)
            activation_value = (att_box * mask_out).reshape(b, -1).sum(dim=-1)
            loss += torch.mean(activation_value)

    return loss / object_number


def caculate_loss_self_att(self_first, self_second, self_third, bboxes, object_positions, t, list_res=[256], smooth_att=True, sigma=0.5, kernel_size=3):
    # Self-attention refocusing (SAR) loss, averaged over every self-attention map at the requested resolutions.
    all_attn = get_all_self_att(self_first, self_second, self_third)
    cnt = 0
    total_loss = 0
    for res in list_res:
        attn_maps = all_attn[res]
        for attn in attn_maps:
            total_loss += loss_one_att_outside(attn, bboxes, object_positions, t)
            cnt += 1

    return total_loss / cnt


def get_all_self_att(self_first, self_second, self_third):
    # Bucket the self-attention maps of the three UNet stages by their number of query positions.
    result = {256: [], 1024: [], 4096: [], 64: [], 94: [], 1054: [], 286: [], 4126: []}
    all_att = [self_first, self_second, self_third]
    for self_att in all_att:
        for att in self_att:
            if att != []:
                temp = att[0]
                for attn_map in temp:
                    current_res = attn_map.shape[1]
                    result[current_res].append(attn_map)
    return result


def get_all_attention(attn_maps_mid, attn_maps_up, attn_maps_down, res):
    # Collect every cross-attention map whose spatial size matches `res` and average them.
    result = []

    for attn_map_integrated in attn_maps_up:
        if attn_map_integrated == []: continue
        attn_map = attn_map_integrated[0][0]
        b, i, j = attn_map.shape
        H = W = int(math.sqrt(i))
        if H == res:
            result.append(attn_map.reshape(-1, res, res, attn_map.shape[-1]))
    for attn_map_integrated in attn_maps_mid:
        attn_map = attn_map_integrated[0]
        b, i, j = attn_map.shape
        H = W = int(math.sqrt(i))
        if H == res:
            result.append(attn_map.reshape(-1, res, res, attn_map.shape[-1]))
    for attn_map_integrated in attn_maps_down:
        if attn_map_integrated == []: continue
        attn_map = attn_map_integrated[0][0]
        if attn_map == []: continue
        b, i, j = attn_map.shape
        H = W = int(math.sqrt(i))
        if H == res:
            result.append(attn_map.reshape(-1, res, res, attn_map.shape[-1]))

    result = torch.cat(result, dim=0)
    result = result.sum(0) / result.shape[0]
    return result


def caculate_loss_att_fixed_cnt(attn_maps_mid, attn_maps_up, attn_maps_down, bboxes, object_positions, t, res=16, smooth_att=True, sigma=0.5, kernel_size=3):
    # Cross-attention refocusing (CAR) loss: each grounded token should attend strongly inside its own
    # boxes and weakly inside other objects' boxes and the background.
    attn16 = get_all_attention(attn_maps_mid, attn_maps_up, attn_maps_down, res)
    all_attn = [attn16]
    obj_number = len(bboxes)
    total_loss = 0
    for attn in all_attn[0:1]:
        attn_text = attn[:, :, 1:-1]
        attn_text *= 100
        attn_text = torch.nn.functional.softmax(attn_text, dim=-1)
        current_res = attn.shape[0]
        H = W = current_res

        for obj_idx in range(obj_number):
            num_boxes = 0

            for obj_position in object_positions[obj_idx]:
                true_obj_position = obj_position - 1
                att_map_obj = attn_text[:, :, true_obj_position]
                if smooth_att:
                    smoothing = GaussianSmoothing(channels=1, kernel_size=kernel_size, sigma=sigma, dim=2).cuda()
                    input = F.pad(att_map_obj.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode='reflect')
                    att_map_obj = smoothing(input).squeeze(0).squeeze(0)
                other_att_map_obj = att_map_obj.clone()
                att_copy = att_map_obj.clone()

                for obj_box in bboxes[obj_idx]:
                    x_min, y_min, x_max, y_max = int(obj_box[0] * W), \
                        int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)

                    if att_map_obj[y_min: y_max, x_min: x_max].numel() == 0:
                        max_inside = 1.
                    else:
                        max_inside = att_map_obj[y_min: y_max, x_min: x_max].max()
                    total_loss += 1. - max_inside

                    # find the maximum outside the box, i.e. inside the other objects' boxes
                    att_copy[y_min: y_max, x_min: x_max] = 0.
                    other_att_map_obj[y_min: y_max, x_min: x_max] = 0.

                    for obj_outside in range(obj_number):
                        if obj_outside != obj_idx:
                            for obj_out_box in bboxes[obj_outside]:
                                x_min_out, y_min_out, x_max_out, y_max_out = int(obj_out_box[0] * W), \
                                    int(obj_out_box[1] * H), int(obj_out_box[2] * W), int(obj_out_box[3] * H)

                                if other_att_map_obj[y_min_out: y_max_out, x_min_out: x_max_out].numel() == 0:
                                    max_outside_one = 0
                                else:
                                    max_outside_one = other_att_map_obj[y_min_out: y_max_out, x_min_out: x_max_out].max()
                                att_copy[y_min_out: y_max_out, x_min_out: x_max_out] = 0.
                                total_loss += max_outside_one
                    max_background = att_copy.max()
                    total_loss += len(bboxes[obj_idx]) * max_background / 2.

    return total_loss / obj_number
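The two helpers used by the sampler, caculate_loss_self_att (the "SAR" term) and caculate_loss_att_fixed_cnt (the "CAR" term), take nested lists of attention maps from the UNet's down/mid/up stages, normalized xyxy boxes in [0, 1], and 1-based token indices per grounded phrase. Below is a minimal smoke-test sketch of how they could be called; the dummy tensor shapes, the nesting of the lists, and the choice of smooth_att=False (the smoothing path calls .cuda() unconditionally) are assumptions made here to match the signatures above, not values taken from the GLIGEN pipeline.

# Hypothetical smoke test for the loss helpers above; shapes and list nesting are
# assumptions chosen to satisfy the function signatures, not pipeline values.
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One grounded object, one normalized xyxy box, one 1-based token position.
bboxes = [[[0.1, 0.2, 0.5, 0.6]]]
object_positions = [[2]]

# Self-attention maps: 256 query/key positions correspond to a 16x16 latent grid.
self_first = [[torch.randn(2, 8, 256, 256, device=device)]]
self_second, self_third = [[]], [[]]
sar = caculate_loss_self_att(self_first, self_second, self_third,
                             bboxes, object_positions, t=0)

# Cross-attention maps: 256 query positions x 77 text tokens.
attn_mid = [[torch.randn(8, 256, 77, device=device)]]
attn_up = [[[torch.randn(8, 256, 77, device=device)]]]
attn_down = [[], [], []]          # empty entries are skipped by get_all_attention
car = caculate_loss_att_fixed_cnt(attn_mid, attn_up, attn_down,
                                  bboxes, object_positions, t=0,
                                  res=16, smooth_att=False)
print(sar.item(), car.item())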
gligen/ldm/models/diffusion/plms.py
ADDED
@@ -0,0 +1,295 @@
import torch
import numpy as np
from tqdm import tqdm
from functools import partial
from copy import deepcopy
from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
import math
from ldm.models.diffusion.loss import caculate_loss_att_fixed_cnt, caculate_loss_self_att


class PLMSSampler(object):
    def __init__(self, diffusion, model, schedule="linear", alpha_generator_func=None, set_alpha_scale=None):
        super().__init__()
        self.diffusion = diffusion
        self.model = model
        self.device = diffusion.betas.device
        self.ddpm_num_timesteps = diffusion.num_timesteps
        self.schedule = schedule
        self.alpha_generator_func = alpha_generator_func
        self.set_alpha_scale = set_alpha_scale

    def register_buffer(self, name, attr):
        if type(attr) == torch.Tensor:
            attr = attr.to(self.device)
        setattr(self, name, attr)

    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=False):
        if ddim_eta != 0:
            raise ValueError('ddim_eta must be 0 for PLMS')
        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
                                                  num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
        alphas_cumprod = self.diffusion.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.device)

        self.register_buffer('betas', to_torch(self.diffusion.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.diffusion.alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        # ddim sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                   ddim_timesteps=self.ddim_timesteps,
                                                                                   eta=ddim_eta, verbose=verbose)
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
                1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    # @torch.no_grad()
    def sample(self, S, shape, input, uc=None, guidance_scale=1, mask=None, x0=None, loss_type='SAR_CAR'):
        self.make_schedule(ddim_num_steps=S)
        return self.plms_sampling(shape, input, uc, guidance_scale, mask=mask, x0=x0, loss_type=loss_type)

    # @torch.no_grad()
    def plms_sampling(self, shape, input, uc=None, guidance_scale=1, mask=None, x0=None, loss_type='SAR_CAR'):

        b = shape[0]

        img = input["x"]
        if img is None:
            img = torch.randn(shape, device=self.device)
            input["x"] = img

        time_range = np.flip(self.ddim_timesteps)
        total_steps = self.ddim_timesteps.shape[0]

        old_eps = []

        if self.alpha_generator_func is not None:
            alphas = self.alpha_generator_func(len(time_range))

        for i, step in enumerate(time_range):

            # set alpha and restore first conv layer
            if self.alpha_generator_func is not None:
                self.set_alpha_scale(self.model, alphas[i])
                if alphas[i] == 0:
                    self.model.restore_first_conv_from_SD()

            # run
            index = total_steps - i - 1
            ts = torch.full((b,), step, device=self.device, dtype=torch.long)
            ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=self.device, dtype=torch.long)

            if mask is not None:
                assert x0 is not None
                img_orig = self.diffusion.q_sample(x0, ts)
                img = img_orig * mask + (1. - mask) * img
                input["x"] = img

            # three loss types
            if loss_type is not None and loss_type != 'standard':
                if input['object_position'] != []:
                    if loss_type == 'SAR_CAR':
                        x = self.update_loss_self_cross(input, i, index, ts)
                    elif loss_type == 'SAR':
                        x = self.update_only_self(input, i, index, ts)
                    elif loss_type == 'CAR':
                        x = self.update_loss_only_cross(input, i, index, ts)
                    input["x"] = x
            img, pred_x0, e_t = self.p_sample_plms(input, ts, index=index, uc=uc, guidance_scale=guidance_scale, old_eps=old_eps, t_next=ts_next)
            input["x"] = img
            old_eps.append(e_t)
            if len(old_eps) >= 4:
                old_eps.pop(0)

        return img

    def update_loss_self_cross(self, input, index1, index, ts, type_loss='self_accross'):
        if index1 < 10:
            loss_scale = 3
            max_iter = 5
        elif index1 < 20:
            loss_scale = 2
            max_iter = 5
        else:
            loss_scale = 0.8
            max_iter = 1

        loss_threshold = 0.1
        max_index = 20
        x = deepcopy(input["x"])
        iteration = 0
        loss = torch.tensor(10000)
        input["timesteps"] = ts

        print("optimize", index1)
        self.model.train()
        while loss.item() > loss_threshold and iteration < max_iter and (index1 < max_index):
            print('iter', iteration)
            x = x.requires_grad_(True)
            input['x'] = x
            e_t, att_first, att_second, att_third, self_first, self_second, self_third = self.model(input)
            bboxes = input['boxes_att']
            object_positions = input['object_position']
            loss1 = caculate_loss_self_att(self_first, self_second, self_third, bboxes=bboxes,
                                           object_positions=object_positions, t=index1) * loss_scale
            loss2 = caculate_loss_att_fixed_cnt(att_second, att_first, att_third, bboxes=bboxes,
                                                object_positions=object_positions, t=index1) * loss_scale
            loss = loss1 + loss2
            print('loss', loss, loss1, loss2)
            torch.autograd.backward(loss, retain_graph=True)
            grad_cond = x.grad
            x = x - grad_cond
            x = x.detach()
            iteration += 1
            torch.cuda.empty_cache()
        return x

    def update_loss_only_cross(self, input, index1, index, ts, type_loss='self_accross'):
        if index1 < 10:
            loss_scale = 3
            max_iter = 5
        elif index1 < 20:
            loss_scale = 2
            max_iter = 5
        else:
            loss_scale = 1
            max_iter = 1
        loss_threshold = 0.1

        max_index = 30
        x = deepcopy(input["x"])
        iteration = 0
        loss = torch.tensor(10000)
        input["timesteps"] = ts

        print("optimize", index1)
        while loss.item() > loss_threshold and iteration < max_iter and (index1 < max_index):
            print('iter', iteration)
            x = x.requires_grad_(True)
            input['x'] = x
            e_t, att_first, att_second, att_third, self_first, self_second, self_third = self.model(input)

            bboxes = input['boxes']
            object_positions = input['object_position']
            loss2 = caculate_loss_att_fixed_cnt(att_second, att_first, att_third, bboxes=bboxes,
                                                object_positions=object_positions, t=index1) * loss_scale
            loss = loss2
            print('loss', loss)
            torch.autograd.backward(loss)
            grad_cond = x.grad
            x = x - grad_cond
            x = x.detach()
            iteration += 1
            torch.cuda.empty_cache()
        return x

    def update_only_self(self, input, index1, index, ts, type_loss='self_accross'):
        if index1 < 10:
            loss_scale = 4
            max_iter = 5
        elif index1 < 20:
            loss_scale = 3
            max_iter = 5
        else:
            loss_scale = 1
            max_iter = 1
        loss_threshold = 0.1

        max_index = 30
        x = deepcopy(input["x"])
        iteration = 0
        loss = torch.tensor(10000)
        input["timesteps"] = ts

        print("optimize", index1)
        while loss.item() > loss_threshold and iteration < max_iter and (index1 < max_index):
            print('iter', iteration)
            x = x.requires_grad_(True)
            input['x'] = x
            e_t, att_first, att_second, att_third, self_first, self_second, self_third = self.model(input)

            bboxes = input['boxes']
            object_positions = input['object_position']
            loss = caculate_loss_self_att(self_first, self_second, self_third, bboxes=bboxes,
                                          object_positions=object_positions, t=index1) * loss_scale
            print('loss', loss)
            torch.autograd.backward(loss)
            grad_cond = x.grad

            x = x - grad_cond
            x = x.detach()
            iteration += 1
            torch.cuda.empty_cache()
        return x

    @torch.no_grad()
    def p_sample_plms(self, input, t, index, guidance_scale=1., uc=None, old_eps=None, t_next=None):
        x = deepcopy(input["x"])
        b = x.shape[0]
        self.model.eval()

        def get_model_output(input):
            e_t, first, second, third, _, _, _ = self.model(input)
            if uc is not None and guidance_scale != 1:
                unconditional_input = dict(x=input["x"], timesteps=input["timesteps"], context=uc, inpainting_extra_input=None, grounding_extra_input=None)
                e_t_uncond, _, _, _, _, _, _ = self.model(unconditional_input)
                e_t = e_t_uncond + guidance_scale * (e_t - e_t_uncond)
            return e_t

        def get_x_prev_and_pred_x0(e_t, index):
            # select parameters corresponding to the currently considered timestep
            a_t = torch.full((b, 1, 1, 1), self.ddim_alphas[index], device=self.device)
            a_prev = torch.full((b, 1, 1, 1), self.ddim_alphas_prev[index], device=self.device)
            sigma_t = torch.full((b, 1, 1, 1), self.ddim_sigmas[index], device=self.device)
            sqrt_one_minus_at = torch.full((b, 1, 1, 1), self.ddim_sqrt_one_minus_alphas[index], device=self.device)

            # current prediction for x_0
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()

            # direction pointing to x_t
            dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
            noise = sigma_t * torch.randn_like(x)
            x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
            return x_prev, pred_x0

        input["timesteps"] = t
        e_t = get_model_output(input)
        if len(old_eps) == 0:
            # Pseudo Improved Euler (2nd order)
            x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
            input["x"] = x_prev
            input["timesteps"] = t_next
            e_t_next = get_model_output(input)
            e_t_prime = (e_t + e_t_next) / 2
        elif len(old_eps) == 1:
            # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (3 * e_t - old_eps[-1]) / 2
        elif len(old_eps) == 2:
            # 3rd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
        elif len(old_eps) >= 3:
            # 4th order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24

        x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)

        return x_prev, pred_x0, e_t
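For orientation, here is a hypothetical way to wire PLMSSampler up end to end. DummyDiffusion and DummyModel are stand-ins invented for this sketch (the real GLIGEN diffusion wrapper and grounded UNet live elsewhere in the repo); with loss_type='standard' and no unconditional context, the attention-guidance branches are skipped, so the stub model only has to return a 7-tuple whose first element is the noise prediction.

# A minimal sketch with stand-in objects; DummyDiffusion/DummyModel only mimic the
# attributes PLMSSampler actually touches and are not part of this commit.
import torch

class DummyDiffusion:
    def __init__(self, num_timesteps=1000):
        self.num_timesteps = num_timesteps
        self.betas = torch.linspace(1e-4, 2e-2, num_timesteps)
        alphas = 1. - self.betas
        self.alphas_cumprod = torch.cumprod(alphas, dim=0)
        self.alphas_cumprod_prev = torch.cat([torch.ones(1), self.alphas_cumprod[:-1]])

    def q_sample(self, x0, t):
        return x0  # the inpainting path is not exercised in this sketch

class DummyModel(torch.nn.Module):
    def forward(self, input):
        # noise prediction plus six (empty) attention collections, matching what the sampler unpacks
        return torch.randn_like(input["x"]), [], [], [], [], [], []

sampler = PLMSSampler(DummyDiffusion(), DummyModel())
inp = {"x": None, "timesteps": None, "object_position": []}
samples = sampler.sample(S=50, shape=(1, 4, 64, 64), input=inp, loss_type='standard')
print(samples.shape)  # torch.Size([1, 4, 64, 64])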
gligen/ldm/modules/__pycache__/attention.cpython-38.pyc
ADDED
Binary file (13 kB). View file
|
|
gligen/ldm/modules/__pycache__/x_transformer.cpython-38.pyc
ADDED
Binary file (18.3 kB). View file
|
|
gligen/ldm/modules/diffusionmodules/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (188 Bytes). View file
|
|
gligen/ldm/modules/diffusionmodules/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (167 Bytes). View file
|
|
gligen/ldm/modules/diffusionmodules/__pycache__/convnext.cpython-38.pyc
ADDED
Binary file (8.43 kB). View file
|
|
gligen/ldm/modules/diffusionmodules/__pycache__/model.cpython-38.pyc
ADDED
Binary file (20.7 kB). View file
|
|
gligen/ldm/modules/diffusionmodules/__pycache__/normal_grounding_net.cpython-38.pyc
ADDED
Binary file (1.94 kB). View file
|
|
gligen/ldm/modules/diffusionmodules/__pycache__/openaimodel.cpython-38.pyc
ADDED
Binary file (13 kB). View file
|
|
gligen/ldm/modules/diffusionmodules/__pycache__/text_grounding_net.cpython-38.pyc
ADDED
Binary file (1.66 kB). View file
|
|
gligen/ldm/modules/diffusionmodules/__pycache__/util.cpython-37.pyc
ADDED
Binary file (10.1 kB). View file
|
|
gligen/ldm/modules/diffusionmodules/__pycache__/util.cpython-38.pyc
ADDED
Binary file (10.2 kB). View file
|
|