Delete models/uvit.py
- models/uvit.py +0 -310
models/uvit.py
DELETED
@@ -1,310 +0,0 @@
import torch
import torch.nn as nn
import math
import timm
from timm.models.layers import trunc_normal_
from timm.models.vision_transformer import PatchEmbed, Mlp
# assert timm.__version__ == "0.3.2"  # version checks
import einops
import torch.utils.checkpoint

# the xformers lib allows less memory, faster training and inference
try:
    import xformers
    import xformers.ops
    XFORMERS_IS_AVAILBLE = True
except ImportError:
    XFORMERS_IS_AVAILBLE = False
    # print('xformers disabled')


def timestep_embedding(timesteps, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    half = dim // 2
    freqs = torch.exp(
        -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
    ).to(device=timesteps.device)
    args = timesteps[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2:
        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
    return embedding

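# Worked example (a sketch, not part of the original file): with dim = 4 and
# max_period = 10000, half = 2 and freqs = exp(-ln(10000) * [0, 1] / 2)
# = [1.0, 0.01], so for t = 1 the row is
# [cos(1.0), cos(0.01), sin(1.0), sin(0.01)] ~= [0.5403, 0.9999, 0.8415, 0.0100]:
#
#     emb = timestep_embedding(torch.tensor([1.0]), dim=4)
#     assert emb.shape == (1, 4)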

def patchify(imgs, patch_size):
    x = einops.rearrange(imgs, 'B C (h p1) (w p2) -> B (h w) (p1 p2 C)', p1=patch_size, p2=patch_size)
    return x


def unpatchify(x, channels=3):
    patch_size = int((x.shape[2] // channels) ** 0.5)
    h = w = int(x.shape[1] ** .5)
    assert h * w == x.shape[1] and patch_size ** 2 * channels == x.shape[2]
    x = einops.rearrange(x, 'B (h w) (p1 p2 C) -> B C (h p1) (w p2)', h=h, p1=patch_size, p2=patch_size)
    return x

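# Shape sketch (example values assumed, not from the original file): a batch
# of latents (B, C, H, W) = (2, 4, 32, 32) with patch_size = 2 becomes
# (2, 256, 16) after patchify -- 16 * 16 = 256 patches, each flattened to
# 2 * 2 * 4 = 16 values -- and unpatchify(x, channels=4) inverts it exactly.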

class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, L, C = x.shape

        qkv = self.qkv(x)
        if XFORMERS_IS_AVAILBLE:  # the xformers lib allows less memory, faster training and inference
            qkv = einops.rearrange(qkv, 'B L (K H D) -> K B L H D', K=3, H=self.num_heads)
            q, k, v = qkv[0], qkv[1], qkv[2]  # B L H D
            x = xformers.ops.memory_efficient_attention(q, k, v)
            x = einops.rearrange(x, 'B L H D -> B L (H D)', H=self.num_heads)
        else:
            qkv = einops.rearrange(qkv, 'B L (K H D) -> K B H L D', K=3, H=self.num_heads)
            q, k, v = qkv[0], qkv[1], qkv[2]  # B H L D
            attn = (q @ k.transpose(-2, -1)) * self.scale
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = (attn @ v).transpose(1, 2).reshape(B, L, C)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x

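# Note (an environment assumption, not in the original file): on PyTorch >= 2.0
# the else-branch above is equivalent -- assuming the default qk_scale and up to
# where dropout is applied -- to the fused kernel
#
#     x = torch.nn.functional.scaled_dot_product_attention(q, k, v)  # q, k, v in B H L D
#
# which, like xformers.ops.memory_efficient_attention, avoids materializing the
# full L x L attention matrix.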

class Block(nn.Module):

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm, skip=False, use_checkpoint=False):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale)
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer)
        self.skip_linear = nn.Linear(2 * dim, dim) if skip else None
        self.use_checkpoint = use_checkpoint

    def forward(self, x, skip=None):
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, skip)
        else:
            return self._forward(x, skip)

    def _forward(self, x, skip=None):
        if self.skip_linear is not None:
            x = self.skip_linear(torch.cat([x, skip], dim=-1))
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x

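# Note (an environment assumption, not in the original file): recent PyTorch
# releases warn unless use_reentrant is passed to checkpoint explicitly, e.g.
#
#     torch.utils.checkpoint.checkpoint(self._forward, x, skip, use_reentrant=False)
#
# The non-reentrant variant also tolerates the skip=None argument used here.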

class UViT(nn.Module):
    def __init__(self, input_size=224, patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.,
                 qkv_bias=False, qk_scale=None, norm_layer=nn.LayerNorm, mlp_time_embed=False, num_classes=-1,
                 use_checkpoint=False, conv=True, skip=True, num_frames=16, class_guided=False, use_lora=False):
        super().__init__()
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_classes = num_classes
        self.in_chans = in_chans

        self.patch_embed = PatchEmbed(
            img_size=input_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.time_embed = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.SiLU(),
            nn.Linear(4 * embed_dim, embed_dim),
        ) if mlp_time_embed else nn.Identity()

        if self.num_classes > 0:
            self.label_emb = nn.Embedding(self.num_classes, embed_dim)
            self.extras = 2
        else:
            self.extras = 1

        self.pos_embed = nn.Parameter(torch.zeros(1, self.extras + num_patches, embed_dim))
        self.frame_embed = nn.Parameter(torch.zeros(1, num_frames, embed_dim))

        self.in_blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                norm_layer=norm_layer, use_checkpoint=use_checkpoint)
            for _ in range(depth // 2)])

        self.mid_block = Block(
            dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
            norm_layer=norm_layer, use_checkpoint=use_checkpoint)

        self.out_blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                norm_layer=norm_layer, skip=skip, use_checkpoint=use_checkpoint)
            for _ in range(depth // 2)])

        self.norm = norm_layer(embed_dim)
        self.patch_dim = patch_size ** 2 * in_chans
        self.decoder_pred = nn.Linear(embed_dim, self.patch_dim, bias=True)
        self.final_layer = nn.Conv2d(self.in_chans, self.in_chans * 2, 3, padding=1) if conv else nn.Identity()

        trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.frame_embed, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed'}

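    # Token layout sketch (an illustration, not original code): with extras = 1
    # the per-frame sequence fed to the blocks is
    #
    #     [time_token, patch_1, ..., patch_N]        # length 1 + N
    #
    # and with extras = 2 (class-conditional) a label token is prepended, so
    # pos_embed covers exactly extras + num_patches positions.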
    def forward_(self, x, timesteps, y=None):
        x = self.patch_embed(x)  # 48, 256, 1152
        B, L, D = x.shape

        time_token = self.time_embed(timestep_embedding(timesteps, self.embed_dim))  # 3, 1152
        time_token = time_token.unsqueeze(dim=1)  # 3, 1, 1152
        x = torch.cat((time_token, x), dim=1)

        if y is not None:
            label_emb = self.label_emb(y)
            label_emb = label_emb.unsqueeze(dim=1)
            x = torch.cat((label_emb, x), dim=1)
        x = x + self.pos_embed

        skips = []
        for blk in self.in_blocks:
            x = blk(x)
            skips.append(x)

        x = self.mid_block(x)

        for blk in self.out_blocks:
            x = blk(x, skips.pop())

        x = self.norm(x)
        x = self.decoder_pred(x)
        assert x.size(1) == self.extras + L
        x = x[:, self.extras:, :]
        x = unpatchify(x, self.in_chans)
        x = self.final_layer(x)
        return x

    def forward(self, x, timesteps, y=None):
        batch, frame, _, _, _ = x.shape
        # after this rearrange, each group of `frame` consecutive entries
        # belongs to the same video
        x = einops.rearrange(x, 'b f c h w -> (b f) c h w')  # (3, 16, 4, 32, 32) -> (48, 4, 32, 32)
        x = self.patch_embed(x)  # 48, 256, 1152
        B, L, D = x.shape

        time_token = self.time_embed(timestep_embedding(timesteps, self.embed_dim))  # 3, 1152
        # the repeat of the spatial time token must guarantee that all f
        # frames of a video share the same timestep
        time_token_spatial = einops.repeat(time_token, 'n d -> (n c) d', c=frame)  # 48, 1152
        time_token_spatial = time_token_spatial.unsqueeze(dim=1)  # 48, 1, 1152
        x = torch.cat((time_token_spatial, x), dim=1)  # 48, 257, 1152

        if y is not None:
            label_emb = self.label_emb(y)
            label_emb = label_emb.unsqueeze(dim=1)
            x = torch.cat((label_emb, x), dim=1)
        x = x + self.pos_embed

        skips = []
        for i in range(0, len(self.in_blocks), 2):
            spatial_block, time_block = self.in_blocks[i:i + 2]
            x = spatial_block(x)

            # add frame embeddings and attend along the frame axis;
            # t is the number of tokens per frame; 771, 16, 1152; 771: 3 * 257
            x = einops.rearrange(x, '(b f) t d -> (b t) f d', b=batch)
            skips.append(x)

            if i == 0:
                x = x + self.frame_embed  # 771, 16, 1152

            x = time_block(x)

            x = einops.rearrange(x, '(b t) f d -> (b f) t d', b=batch)  # 48, 257, 1152
            skips.append(x)

        x = self.mid_block(x)

        for i in range(0, len(self.out_blocks), 2):
            spatial_block, time_block = self.out_blocks[i:i + 2]
            x = spatial_block(x, skips.pop())

            # attend along the frame axis again;
            # t is the number of tokens per frame; 771, 16, 1152; 771: 3 * 257
            x = einops.rearrange(x, '(b f) t d -> (b t) f d', b=batch)

            x = time_block(x, skips.pop())

            x = einops.rearrange(x, '(b t) f d -> (b f) t d', b=batch)  # 48, 257, 1152

        x = self.norm(x)
        x = self.decoder_pred(x)
        assert x.size(1) == self.extras + L
        x = x[:, self.extras:, :]
        x = unpatchify(x, self.in_chans)
        x = self.final_layer(x)
        x = einops.rearrange(x, '(b f) c h w -> b f c h w', b=batch)
        return x

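# Shape flow sketch for the factorized space-time attention above (example
# values assumed: batch = 3, frame = 16, 257 tokens per frame, dim 1152):
#
#     spatial blocks see (b f, t, d) = (48, 257, 1152)   # attention over tokens
#     temporal blocks see (b t, f, d) = (771, 16, 1152)  # attention over frames
#
# so each consecutive pair of blocks alternates which axis attention mixes.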

def UViT_XL_2(**kwargs):
    return UViT(patch_size=2, in_chans=4, embed_dim=1152, depth=28,
                num_heads=16, mlp_ratio=4, qkv_bias=False, mlp_time_embed=True,
                use_checkpoint=True, conv=False, **kwargs)


def UViT_L_2(**kwargs):
    return UViT(patch_size=2, in_chans=4, embed_dim=1024, depth=20,
                num_heads=16, mlp_ratio=4, qkv_bias=False, mlp_time_embed=False,
                use_checkpoint=True, **kwargs)

# there are no variants below L; the L-and-larger UViT variants use an
# img_size of 64

UViT_models = {
    'UViT-XL/2': UViT_XL_2, 'UViT-L/2': UViT_L_2
}

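# Usage sketch (an assumed pattern, not in the original file): the dict maps
# config strings to factory functions, e.g.
#
#     model = UViT_models['UViT-XL/2'](input_size=32, num_frames=16)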

if __name__ == '__main__':

    nnet = UViT_XL_2(input_size=32).cuda()  # input_size must match the latent resolution

    imgs = torch.randn(3, 16, 4, 32, 32).cuda()
    timesteps = torch.tensor([1, 2, 3]).cuda()

    outputs = nnet(imgs, timesteps)
    print(outputs.shape)