File size: 13,271 Bytes
c00df70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
Logging to ./
creating model and diffusion...
creating 3DAE...
length of vit_decoder.blocks: 24
init pos_embed with sincos
length of vit_decoder.blocks: 24
ignore dim_up_mlp:  True
AE(
  (encoder): MVEncoderGSDynamicInp(
    (conv_in): Conv2d(10, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (down): ModuleList(
      (0): Module(
        (block): ModuleList(
          (0): ResnetBlock(
            (norm1): GroupNorm(32, 64, eps=1e-06, affine=True)
            (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 64, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
        )
        (attn): ModuleList()
        (downsample): Downsample(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2))
        )
      )
      (1): Module(
        (block): ModuleList(
          (0): ResnetBlock(
            (norm1): GroupNorm(32, 64, eps=1e-06, affine=True)
            (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (nin_shortcut): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1))
          )
        )
        (attn): ModuleList()
        (downsample): Downsample(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2))
        )
      )
      (2): Module(
        (block): ModuleList(
          (0): ResnetBlock(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))
          )
        )
        (attn): ModuleList()
        (downsample): Downsample(
          (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2))
        )
      )
      (3): Module(
        (block): ModuleList(
          (0): ResnetBlock(
            (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
            (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
        )
        (attn): ModuleList()
      )
    )
    (mid): Module(
      (block_1): ResnetBlock(
        (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
        (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (attn_1): SpatialTransformer3D(
        (norm): GroupNorm(32, 256, eps=1e-06, affine=True)
        (proj_in): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))
        (transformer_blocks): ModuleList(
          (0): BasicTransformerBlock3D(
            (attn1): MemoryEfficientCrossAttention(
              (to_q): Linear(in_features=512, out_features=512, bias=False)
              (to_k): Linear(in_features=512, out_features=512, bias=False)
              (q_norm): Identity()
              (k_norm): Identity()
              (to_v): Linear(in_features=512, out_features=512, bias=False)
              (to_out): Sequential(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Dropout(p=0.0, inplace=False)
              )
            )
            (ff): FeedForward(
              (net): Sequential(
                (0): GEGLU(
                  (proj): Linear(in_features=512, out_features=4096, bias=True)
                )
                (1): Dropout(p=0.0, inplace=False)
                (2): Linear(in_features=2048, out_features=512, bias=True)
              )
            )
            (attn2): MemoryEfficientCrossAttention(
              (to_q): Linear(in_features=512, out_features=512, bias=False)
              (to_k): Linear(in_features=512, out_features=512, bias=False)
              (q_norm): Identity()
              (k_norm): Identity()
              (to_v): Linear(in_features=512, out_features=512, bias=False)
              (to_out): Sequential(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Dropout(p=0.0, inplace=False)
              )
            )
            (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          )
        )
        (proj_out): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
      )
      (block_2): ResnetBlock(
        (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
        (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (norm_out): GroupNorm(32, 256, eps=1e-06, affine=True)
    (conv_out): Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (decoder): RodinSR_256_fusionv6_ConvQuant_liteSR_dinoInit3DAttn_SD_B_3L_C_withrollout_withSD_D_ditDecoder(
    (superresolution): ModuleDict(
      (ldm_upsample): PatchEmbedTriplane(
        (proj): Conv2d(12, 3072, kernel_size=(2, 2), stride=(2, 2), groups=3)
        (norm): Identity()
      )
      (quant_conv): Conv2d(24, 24, kernel_size=(1, 1), stride=(1, 1), groups=3)
      (conv_sr): Decoder(
        (conv_in): Conv2d(1024, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (mid): Module(
          (block_1): ResnetBlock(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
          (attn_1): MemoryEfficientAttnBlock(
            (norm): GroupNorm(32, 128, eps=1e-06, affine=True)
            (q): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))
            (k): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))
            (v): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))
            (proj_out): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))
          )
          (block_2): ResnetBlock(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          )
        )
        (up): ModuleList(
          (0): Module(
            (block): ModuleList(
              (0): ResnetBlock(
                (norm1): GroupNorm(32, 64, eps=1e-06, affine=True)
                (conv1): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 32, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (nin_shortcut): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
              )
              (1): ResnetBlock(
                (norm1): GroupNorm(32, 32, eps=1e-06, affine=True)
                (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 32, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              )
            )
            (attn): ModuleList()
          )
          (1): Module(
            (block): ModuleList(
              (0-1): 2 x ResnetBlock(
                (norm1): GroupNorm(32, 64, eps=1e-06, affine=True)
                (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 64, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              )
            )
            (attn): ModuleList()
            (upsample): Upsample(
              (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
          )
          (2): Module(
            (block): ModuleList(
              (0): ResnetBlock(
                (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
                (conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 64, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (nin_shortcut): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1))
              )
              (1): ResnetBlock(
                (norm1): GroupNorm(32, 64, eps=1e-06, affine=True)
                (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 64, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              )
            )
            (attn): ModuleList()
            (upsample): Upsample(
              (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
          )
          (3): Module(
            (block): ModuleList(
              (0-1): 2 x ResnetBlock(
                (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
                (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              )
            )
            (attn): ModuleList()
            (upsample): Upsample(
              (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
          )
        )
        (norm_out): GroupNorm(32, 32, eps=1e-06, affine=True)
        (conv_out): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (vit_decoder): DiT2(
      (blocks): ModuleList(
        (0-23): 24 x DiTBlock2(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=False)
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=False)
          (attn): MemEffAttention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
            (q_norm): Identity()
            (k_norm): Identity()
          )
          (mlp): FusedMLP(
            (mlp): Sequential(
              (0): Linear(in_features=1024, out_features=4096, bias=False)
              (1): FusedDropoutBias(
                (activation_pytorch): GELU(approximate='none')
              )
              (2): Linear(in_features=4096, out_features=1024, bias=False)
              (3): FusedDropoutBias(
                (activation_pytorch): Identity()
              )
            )
          )
          (adaLN_modulation): Sequential(
            (0): SiLU()
            (1): Linear(in_features=1024, out_features=6144, bias=True)
          )
        )
      )
    )
    (triplane_decoder): Triplane(
      (renderer): ImportanceRenderer(
        (ray_marcher): MipRayMarcher2()
      )
      (ray_sampler): PatchRaySampler()
      (decoder): OSGDecoder(
        (net): Sequential(
          (0): FullyConnectedLayer(in_features=32, out_features=64, activation=linear)
          (1): Softplus(beta=1.0, threshold=20.0)
          (2): FullyConnectedLayer(in_features=64, out_features=4, activation=linear)
        )
      )
    )
    (decoder_pred): None
  )
)
create dataset
joint_denoise_rec_model enables AMP to accelerate training
mark joint_denoise_rec_model loading 
loading model from huggingface: yslan/LN3Diff/checkpoints/objaverse/objaverse-dit/i23d/model_joint_denoise_rec_model2990000.safetensors...
mark joint_denoise_rec_model loading finished