maxin-cn committed
Commit 235f463
1 Parent(s): f63632a

Delete models/uvit.py

Files changed (1)
  1. models/uvit.py +0 -310
models/uvit.py DELETED
@@ -1,310 +0,0 @@
- import torch
- import torch.nn as nn
- import math
- import timm
- from timm.models.layers import trunc_normal_
- from timm.models.vision_transformer import PatchEmbed, Mlp
- # assert timm.__version__ == "0.3.2"  # version checks
- import einops
- import torch.utils.checkpoint
-
- # xformers enables lower memory use and faster training and inference
- try:
-     import xformers
-     import xformers.ops
-     XFORMERS_IS_AVAILBLE = True
- except:
-     XFORMERS_IS_AVAILBLE = False
-     # print('xformers disabled')
-
-
- def timestep_embedding(timesteps, dim, max_period=10000):
-     """
-     Create sinusoidal timestep embeddings.
-
-     :param timesteps: a 1-D Tensor of N indices, one per batch element.
-                       These may be fractional.
-     :param dim: the dimension of the output.
-     :param max_period: controls the minimum frequency of the embeddings.
-     :return: an [N x dim] Tensor of positional embeddings.
-     """
-     half = dim // 2
-     freqs = torch.exp(
-         -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
-     ).to(device=timesteps.device)
-     args = timesteps[:, None].float() * freqs[None]
-     embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-     if dim % 2:
-         embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
-     return embedding
-
-
- def patchify(imgs, patch_size):
-     x = einops.rearrange(imgs, 'B C (h p1) (w p2) -> B (h w) (p1 p2 C)', p1=patch_size, p2=patch_size)
-     return x
-
-
- def unpatchify(x, channels=3):
-     patch_size = int((x.shape[2] // channels) ** 0.5)
-     h = w = int(x.shape[1] ** .5)
-     assert h * w == x.shape[1] and patch_size ** 2 * channels == x.shape[2]
-     x = einops.rearrange(x, 'B (h w) (p1 p2 C) -> B C (h p1) (w p2)', h=h, p1=patch_size, p2=patch_size)
-     return x
-
-
- class Attention(nn.Module):
-     def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
-         super().__init__()
-         self.num_heads = num_heads
-         head_dim = dim // num_heads
-         self.scale = qk_scale or head_dim ** -0.5
-
-         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-         self.attn_drop = nn.Dropout(attn_drop)
-         self.proj = nn.Linear(dim, dim)
-         self.proj_drop = nn.Dropout(proj_drop)
-
-     def forward(self, x):
-         B, L, C = x.shape
-
-         qkv = self.qkv(x)
-         if XFORMERS_IS_AVAILBLE:  # xformers enables lower memory use and faster training and inference
-             qkv = einops.rearrange(qkv, 'B L (K H D) -> K B L H D', K=3, H=self.num_heads)
-             q, k, v = qkv[0], qkv[1], qkv[2]  # B L H D
-             x = xformers.ops.memory_efficient_attention(q, k, v)
-             x = einops.rearrange(x, 'B L H D -> B L (H D)', H=self.num_heads)
-         else:
-             qkv = einops.rearrange(qkv, 'B L (K H D) -> K B H L D', K=3, H=self.num_heads)
-             q, k, v = qkv[0], qkv[1], qkv[2]  # B H L D
-             attn = (q @ k.transpose(-2, -1)) * self.scale
-             attn = attn.softmax(dim=-1)
-             attn = self.attn_drop(attn)
-             x = (attn @ v).transpose(1, 2).reshape(B, L, C)
-
-         x = self.proj(x)
-         x = self.proj_drop(x)
-         return x
-
-
- class Block(nn.Module):
-
-     def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
-                  act_layer=nn.GELU, norm_layer=nn.LayerNorm, skip=False, use_checkpoint=False):
-         super().__init__()
-         self.norm1 = norm_layer(dim)
-         self.attn = Attention(
-             dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale)
-         self.norm2 = norm_layer(dim)
-         mlp_hidden_dim = int(dim * mlp_ratio)
-         self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer)
-         self.skip_linear = nn.Linear(2 * dim, dim) if skip else None
-         self.use_checkpoint = use_checkpoint
-
-     def forward(self, x, skip=None):
-         if self.use_checkpoint:
-             return torch.utils.checkpoint.checkpoint(self._forward, x, skip)
-         else:
-             return self._forward(x, skip)
-
-     def _forward(self, x, skip=None):
-         if self.skip_linear is not None:
-             x = self.skip_linear(torch.cat([x, skip], dim=-1))
-         x = x + self.attn(self.norm1(x))
-         x = x + self.mlp(self.norm2(x))
-         return x
-
-
- class UViT(nn.Module):
-     def __init__(self, input_size=224, patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.,
-                  qkv_bias=False, qk_scale=None, norm_layer=nn.LayerNorm, mlp_time_embed=False, num_classes=-1,
-                  use_checkpoint=False, conv=True, skip=True, num_frames=16, class_guided=False, use_lora=False):
-         super().__init__()
-         self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
-         self.num_classes = num_classes
-         self.in_chans = in_chans
-
-         self.patch_embed = PatchEmbed(
-             img_size=input_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
-         num_patches = self.patch_embed.num_patches
-
-         self.time_embed = nn.Sequential(
-             nn.Linear(embed_dim, 4 * embed_dim),
-             nn.SiLU(),
-             nn.Linear(4 * embed_dim, embed_dim),
-         ) if mlp_time_embed else nn.Identity()
-
-         if self.num_classes > 0:
-             self.label_emb = nn.Embedding(self.num_classes, embed_dim)
-             self.extras = 2
-         else:
-             self.extras = 1
-
-         self.pos_embed = nn.Parameter(torch.zeros(1, self.extras + num_patches, embed_dim))
-         self.frame_embed = nn.Parameter(torch.zeros(1, num_frames, embed_dim))
-
-         self.in_blocks = nn.ModuleList([
-             Block(
-                 dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
-                 norm_layer=norm_layer, use_checkpoint=use_checkpoint)
-             for _ in range(depth // 2)])
-
-         self.mid_block = Block(
-             dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
-             norm_layer=norm_layer, use_checkpoint=use_checkpoint)
-
-         self.out_blocks = nn.ModuleList([
-             Block(
-                 dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
-                 norm_layer=norm_layer, skip=skip, use_checkpoint=use_checkpoint)
-             for _ in range(depth // 2)])
-
-         self.norm = norm_layer(embed_dim)
-         self.patch_dim = patch_size ** 2 * in_chans
-         self.decoder_pred = nn.Linear(embed_dim, self.patch_dim, bias=True)
-         self.final_layer = nn.Conv2d(self.in_chans, self.in_chans * 2, 3, padding=1) if conv else nn.Identity()
-
-         trunc_normal_(self.pos_embed, std=.02)
-         trunc_normal_(self.frame_embed, std=.02)
-         self.apply(self._init_weights)
-
-     def _init_weights(self, m):
-         if isinstance(m, nn.Linear):
-             trunc_normal_(m.weight, std=.02)
-             if isinstance(m, nn.Linear) and m.bias is not None:
-                 nn.init.constant_(m.bias, 0)
-         elif isinstance(m, nn.LayerNorm):
-             nn.init.constant_(m.bias, 0)
-             nn.init.constant_(m.weight, 1.0)
-
-     @torch.jit.ignore
-     def no_weight_decay(self):
-         return {'pos_embed'}
-
-     def forward_(self, x, timesteps, y=None):
-         x = self.patch_embed(x)  # 48, 256, 1152
-         B, L, D = x.shape
-
-         time_token = self.time_embed(timestep_embedding(timesteps, self.embed_dim))  # 3, 1152
-         time_token = time_token.unsqueeze(dim=1)  # 3, 1, 1152
-         x = torch.cat((time_token, x), dim=1)
-
-         if y is not None:
-             label_emb = self.label_emb(y)
-             label_emb = label_emb.unsqueeze(dim=1)
-             x = torch.cat((label_emb, x), dim=1)
-         x = x + self.pos_embed
-
-         skips = []
-         for blk in self.in_blocks:
-             x = blk(x)
-             skips.append(x)
-
-         x = self.mid_block(x)
-
-         for blk in self.out_blocks:
-             x = blk(x, skips.pop())
-
-         x = self.norm(x)
-         x = self.decoder_pred(x)
-         assert x.size(1) == self.extras + L
-         x = x[:, self.extras:, :]
-         x = unpatchify(x, self.in_chans)
-         x = self.final_layer(x)
-         return x
-
-     def forward(self, x, timesteps, y=None):
-         batch, frame, _, _, _ = x.shape
-         # after this rearrange, each consecutive group of `frame` samples belongs to the same video
-         x = einops.rearrange(x, 'b f c h w -> (b f) c h w')  # 3 16 4 256 256
-         x = self.patch_embed(x)  # 48, 256, 1152
-         B, L, D = x.shape
-
-         time_token = self.time_embed(timestep_embedding(timesteps, self.embed_dim))  # 3, 1152
-         # the repeat of the spatial timestep token must ensure that all `frame` frames of a video share the same timestep
-         time_token_spatial = einops.repeat(time_token, 'n d -> (n c) d', c=frame)  # 48, 1152
-         time_token_spatial = time_token_spatial.unsqueeze(dim=1)  # 48, 1, 1152
-         x = torch.cat((time_token_spatial, x), dim=1)  # 48, 257, 1152
-
-         if y is not None:
-             label_emb = self.label_emb(y)
-             label_emb = label_emb.unsqueeze(dim=1)
-             x = torch.cat((label_emb, x), dim=1)
-         x = x + self.pos_embed
-
-         skips = []
-         for i in range(0, len(self.in_blocks), 2):
-             spatial_block, time_block = self.in_blocks[i:i+2]
-             x = spatial_block(x)
-
-             # switch to the temporal view: add frame embeddings and attend across frames
-             x = einops.rearrange(x, '(b f) t d -> (b t) f d', b=batch)  # t is the number of tokens per frame; 771, 16, 1152; 771: 3 * 257
-             skips.append(x)
-
-             if i == 0:
-                 x = x + self.frame_embed  # 771, 16, 1152
-
-             x = time_block(x)
-
-             x = einops.rearrange(x, '(b t) f d -> (b f) t d', b=batch)  # 48, 257, 1152
-             skips.append(x)
-
-         x = self.mid_block(x)
-
-         for i in range(0, len(self.out_blocks), 2):
-             spatial_block, time_block = self.out_blocks[i:i+2]
-             x = spatial_block(x, skips.pop())
-
-             # switch to the temporal view and attend across frames
-             x = einops.rearrange(x, '(b f) t d -> (b t) f d', b=batch)  # t is the number of tokens per frame; 771, 16, 1152; 771: 3 * 257
-
-             x = time_block(x, skips.pop())
-
-             x = einops.rearrange(x, '(b t) f d -> (b f) t d', b=batch)  # 48, 257, 1152
-
-
-         x = self.norm(x)
-         x = self.decoder_pred(x)
-         assert x.size(1) == self.extras + L
-         x = x[:, self.extras:, :]
-         x = unpatchify(x, self.in_chans)
-         x = self.final_layer(x)
-         x = einops.rearrange(x, '(b f) c h w -> b f c h w', b=batch)
-         return x
-
- def UViT_XL_2(**kwargs):
-     return UViT(patch_size=2, in_chans=4, embed_dim=1152, depth=28,
-                 num_heads=16, mlp_ratio=4, qkv_bias=False, mlp_time_embed=True,
-                 use_checkpoint=True, conv=False, **kwargs)
-
- def UViT_L_2(**kwargs):
-     return UViT(patch_size=2, in_chans=4, embed_dim=1024, depth=20,
-                 num_heads=16, mlp_ratio=4, qkv_bias=False, mlp_time_embed=False,
-                 use_checkpoint=True, **kwargs)
-
- # no configs smaller than L; in UViT, models below L use img_size 64
-
- UViT_models = {
-     'UViT-XL/2': UViT_XL_2, 'UViT-L/2': UViT_L_2
- }
-
-
- if __name__ == '__main__':
-
-     nnet = UViT_XL_2().cuda()
-
-     imgs = torch.randn(3, 16, 4, 32, 32).cuda()
-     timesteps = torch.tensor([1, 2, 3]).cuda()
-
-     outputs = nnet(imgs, timesteps)
-     print(outputs.shape)
-
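
For reference while reviewing this deletion, here is a minimal, self-contained sketch (not part of the deleted file) of the spatial/temporal token reshaping that the removed UViT.forward relies on. Shapes follow the inline comments above (3 videos, 16 frames, 256 patch tokens plus 1 time token per frame, embedding dim 1152); the tensors are random placeholders.

import torch
import einops

batch, frames, tokens, dim = 3, 16, 257, 1152

# After patch embedding, frames are folded into the batch: (b f) t d
x = torch.randn(batch * frames, tokens, dim)

# One timestep embedding per video, repeated so every frame of a video
# shares the same timestep token (mirrors the einops.repeat in forward()).
time_token = torch.randn(batch, dim)
time_token_spatial = einops.repeat(time_token, 'n d -> (n c) d', c=frames)  # (48, 1152)

# Spatial attention sees (b f) sequences of length t ...
spatial_view = x                                                         # (48, 257, 1152)
# ... temporal attention sees (b t) sequences of length f.
temporal_view = einops.rearrange(x, '(b f) t d -> (b t) f d', b=batch)   # (771, 16, 1152)

# The round trip is lossless, which is what lets forward() pass the same
# tensor through alternating spatial and temporal Blocks.
back = einops.rearrange(temporal_view, '(b t) f d -> (b f) t d', b=batch)
assert torch.equal(back, spatial_view)
print(time_token_spatial.shape, spatial_view.shape, temporal_view.shape)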