SsharvienKumar commited on
Commit
1054262
·
verified ·
1 Parent(s): b682f4c

Upload 54 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoints/Cataract-1K/graphencoder_masked/best_val_loss.pth +3 -0
  2. checkpoints/Cataract-1K/graphencoder_segclip/best_val_loss.pth +3 -0
  3. checkpoints/Cataract-1K/vae_vid_diffusion/vae/config.json +37 -0
  4. checkpoints/Cataract-1K/vae_vid_diffusion/vae/diffusion_pytorch_model.safetensors +3 -0
  5. checkpoints/Cataract-1K/video_diffusion_img_graph/checkpoint/optimizer.bin +3 -0
  6. checkpoints/Cataract-1K/video_diffusion_img_graph/checkpoint/random_states_0.pkl +3 -0
  7. checkpoints/Cataract-1K/video_diffusion_img_graph/checkpoint/scheduler.bin +3 -0
  8. checkpoints/Cataract-1K/video_diffusion_img_graph/checkpoint/unet/config.json +79 -0
  9. checkpoints/Cataract-1K/video_diffusion_img_graph/checkpoint/unet/diffusion_pytorch_model.safetensors +3 -0
  10. checkpoints/Cataract-1K/video_diffusion_ximg_graph/checkpoint/optimizer.bin +3 -0
  11. checkpoints/Cataract-1K/video_diffusion_ximg_graph/checkpoint/random_states_0.pkl +3 -0
  12. checkpoints/Cataract-1K/video_diffusion_ximg_graph/checkpoint/scheduler.bin +3 -0
  13. checkpoints/Cataract-1K/video_diffusion_ximg_graph/checkpoint/unet/config.json +79 -0
  14. checkpoints/Cataract-1K/video_diffusion_ximg_graph/checkpoint/unet/diffusion_pytorch_model.safetensors +3 -0
  15. checkpoints/Cataract-1K/vqgan_image/checkpoint.ckpt +3 -0
  16. checkpoints/Cataract-1K/vqgan_image/config.yaml +45 -0
  17. checkpoints/Cataract-1K/vqgan_segmentation/checkpoint.ckpt +3 -0
  18. checkpoints/Cataract-1K/vqgan_segmentation/config.yaml +42 -0
  19. checkpoints/Cataracts-50/graphencoder_masked/best_val_loss.pth +3 -0
  20. checkpoints/Cataracts-50/graphencoder_segclip/best_val_loss.pth +3 -0
  21. checkpoints/Cataracts-50/vae_vid_diffusion/vae/config.json +37 -0
  22. checkpoints/Cataracts-50/vae_vid_diffusion/vae/diffusion_pytorch_model.safetensors +3 -0
  23. checkpoints/Cataracts-50/video_diffusion_img_graph/checkpoint/optimizer.bin +3 -0
  24. checkpoints/Cataracts-50/video_diffusion_img_graph/checkpoint/random_states_0.pkl +3 -0
  25. checkpoints/Cataracts-50/video_diffusion_img_graph/checkpoint/scheduler.bin +3 -0
  26. checkpoints/Cataracts-50/video_diffusion_img_graph/checkpoint/unet/config.json +79 -0
  27. checkpoints/Cataracts-50/video_diffusion_img_graph/checkpoint/unet/diffusion_pytorch_model.safetensors +3 -0
  28. checkpoints/Cataracts-50/video_diffusion_ximg_graph/checkpoint/optimizer.bin +3 -0
  29. checkpoints/Cataracts-50/video_diffusion_ximg_graph/checkpoint/random_states_0.pkl +3 -0
  30. checkpoints/Cataracts-50/video_diffusion_ximg_graph/checkpoint/scheduler.bin +3 -0
  31. checkpoints/Cataracts-50/video_diffusion_ximg_graph/checkpoint/unet/config.json +79 -0
  32. checkpoints/Cataracts-50/video_diffusion_ximg_graph/checkpoint/unet/diffusion_pytorch_model.safetensors +3 -0
  33. checkpoints/Cataracts-50/vqgan_image/checkpoint.ckpt +3 -0
  34. checkpoints/Cataracts-50/vqgan_image/config.yaml +45 -0
  35. checkpoints/Cataracts-50/vqgan_segmentation/checkpoint.ckpt +3 -0
  36. checkpoints/Cataracts-50/vqgan_segmentation/config.yaml +42 -0
  37. checkpoints/Cholec-80/graphencoder_masked/best_val_loss.pth +3 -0
  38. checkpoints/Cholec-80/graphencoder_segclip/best_val_loss.pth +3 -0
  39. checkpoints/Cholec-80/vae_vid_diffusion/vae/config.json +37 -0
  40. checkpoints/Cholec-80/vae_vid_diffusion/vae/diffusion_pytorch_model.safetensors +3 -0
  41. checkpoints/Cholec-80/video_diffusion_img_graph/checkpoint/optimizer.bin +3 -0
  42. checkpoints/Cholec-80/video_diffusion_img_graph/checkpoint/random_states_0.pkl +3 -0
  43. checkpoints/Cholec-80/video_diffusion_img_graph/checkpoint/scheduler.bin +3 -0
  44. checkpoints/Cholec-80/video_diffusion_img_graph/checkpoint/unet/config.json +79 -0
  45. checkpoints/Cholec-80/video_diffusion_img_graph/checkpoint/unet/diffusion_pytorch_model.safetensors +3 -0
  46. checkpoints/Cholec-80/video_diffusion_ximg_graph/checkpoint/optimizer.bin +3 -0
  47. checkpoints/Cholec-80/video_diffusion_ximg_graph/checkpoint/random_states_0.pkl +3 -0
  48. checkpoints/Cholec-80/video_diffusion_ximg_graph/checkpoint/scheduler.bin +3 -0
  49. checkpoints/Cholec-80/video_diffusion_ximg_graph/checkpoint/unet/config.json +79 -0
  50. checkpoints/Cholec-80/video_diffusion_ximg_graph/checkpoint/unet/diffusion_pytorch_model.safetensors +3 -0
checkpoints/Cataract-1K/graphencoder_masked/best_val_loss.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:634d9725ecada96f77e84518ee09f935a2e4574f368436c27a68e9c818ad6814
3
+ size 350320133
checkpoints/Cataract-1K/graphencoder_segclip/best_val_loss.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2df2b1f88309f3e6c3f08a18c68696b68bd728f13aeacdd011f83e83cda7745f
3
+ size 277289009
checkpoints/Cataract-1K/vae_vid_diffusion/vae/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.31.0",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D"
16
+ ],
17
+ "force_upcast": true,
18
+ "in_channels": 3,
19
+ "latent_channels": 4,
20
+ "latents_mean": null,
21
+ "latents_std": null,
22
+ "layers_per_block": 2,
23
+ "mid_block_add_attention": true,
24
+ "norm_num_groups": 32,
25
+ "out_channels": 3,
26
+ "sample_size": 512,
27
+ "scaling_factor": 0.18215,
28
+ "shift_factor": null,
29
+ "up_block_types": [
30
+ "UpDecoderBlock2D",
31
+ "UpDecoderBlock2D",
32
+ "UpDecoderBlock2D",
33
+ "UpDecoderBlock2D"
34
+ ],
35
+ "use_post_quant_conv": true,
36
+ "use_quant_conv": true
37
+ }
checkpoints/Cataract-1K/vae_vid_diffusion/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc2d8efeafe0511f546f6d56f39c325bf236f98df00a90736faa35bdefe47efc
3
+ size 334643268
checkpoints/Cataract-1K/video_diffusion_img_graph/checkpoint/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c3b990077004618f00492a46bafa029f5fd4f8caa8ffecd82c6b91c97f01d11
3
+ size 9847488907
checkpoints/Cataract-1K/video_diffusion_img_graph/checkpoint/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc020875c8ab2c3e98daf2aeed4f0b7edf0b666718839dd67ca211b76dd67a1
3
+ size 15060
checkpoints/Cataract-1K/video_diffusion_img_graph/checkpoint/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c776fe9e8b0b55b3a6dc8e06a2d0ccf5cecf4f2ff67df8c22794b8a6e2a9d288
3
+ size 1000
checkpoints/Cataract-1K/video_diffusion_img_graph/checkpoint/unet/config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "VideoLDMUNet3DConditionModel",
3
+ "_diffusers_version": "0.21.2",
4
+ "_name_or_path": "",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": null,
9
+ "attention_head_dim": [
10
+ 5,
11
+ 10,
12
+ 20,
13
+ 20
14
+ ],
15
+ "attention_type": "default",
16
+ "augment_temporal_attention": true,
17
+ "block_out_channels": [
18
+ 320,
19
+ 640,
20
+ 1280,
21
+ 1280
22
+ ],
23
+ "center_input_sample": false,
24
+ "class_embed_type": "identity",
25
+ "class_embeddings_concat": true,
26
+ "conv_in_kernel": 3,
27
+ "conv_out_kernel": 3,
28
+ "cross_attention_dim": 1024,
29
+ "cross_attention_norm": null,
30
+ "down_block_types": [
31
+ "CrossAttnDownBlock2D",
32
+ "CrossAttnDownBlock2D",
33
+ "CrossAttnDownBlock2D",
34
+ "DownBlock2D"
35
+ ],
36
+ "downsample_padding": 1,
37
+ "dropout": 0.0,
38
+ "dual_cross_attention": false,
39
+ "encoder_hid_dim": null,
40
+ "encoder_hid_dim_type": null,
41
+ "first_frame_condition_mode": "concat",
42
+ "flip_sin_to_cos": true,
43
+ "freq_shift": 0,
44
+ "in_channels": 4,
45
+ "layers_per_block": 2,
46
+ "mid_block_only_cross_attention": null,
47
+ "mid_block_scale_factor": 1,
48
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
49
+ "n_frames": 16,
50
+ "n_temp_heads": 8,
51
+ "norm_eps": 1e-05,
52
+ "norm_num_groups": 32,
53
+ "num_attention_heads": null,
54
+ "num_class_embeds": null,
55
+ "only_cross_attention": false,
56
+ "out_channels": 4,
57
+ "projection_class_embeddings_input_dim": null,
58
+ "resnet_out_scale_factor": 1.0,
59
+ "resnet_skip_time_act": false,
60
+ "resnet_time_scale_shift": "default",
61
+ "sample_size": 64,
62
+ "temp_pos_embedding": "rotary",
63
+ "time_cond_proj_dim": null,
64
+ "time_embedding_act_fn": null,
65
+ "time_embedding_dim": 512,
66
+ "time_embedding_type": "positional",
67
+ "timestep_post_act": null,
68
+ "transformer_layers_per_block": 1,
69
+ "up_block_types": [
70
+ "UpBlock2D",
71
+ "CrossAttnUpBlock2D",
72
+ "CrossAttnUpBlock2D",
73
+ "CrossAttnUpBlock2D"
74
+ ],
75
+ "upcast_attention": false,
76
+ "use_frame_stride_condition": false,
77
+ "use_linear_projection": true,
78
+ "use_temporal": true
79
+ }
checkpoints/Cataract-1K/video_diffusion_img_graph/checkpoint/unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:103079d3c7e0f7edfb51eea9e98d15644bf1fa28054bdb4b7d964b188c073bff
3
+ size 4964732628
checkpoints/Cataract-1K/video_diffusion_ximg_graph/checkpoint/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc52188ffd3c45d870cab39802f222ab0782a3694eb6150ee128f776efa40c16
3
+ size 9847488907
checkpoints/Cataract-1K/video_diffusion_ximg_graph/checkpoint/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c098ce11373ec59e691dab19cc25a6548b2a666923316903127bc0553d1b237
3
+ size 15060
checkpoints/Cataract-1K/video_diffusion_ximg_graph/checkpoint/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ad0081b463218f8db5c315a84f2cc987a7e6047b334bed692a360c3c7e1ad21
3
+ size 1000
checkpoints/Cataract-1K/video_diffusion_ximg_graph/checkpoint/unet/config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "VideoLDMUNet3DConditionModel",
3
+ "_diffusers_version": "0.21.2",
4
+ "_name_or_path": "",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": null,
9
+ "attention_head_dim": [
10
+ 5,
11
+ 10,
12
+ 20,
13
+ 20
14
+ ],
15
+ "attention_type": "default",
16
+ "augment_temporal_attention": true,
17
+ "block_out_channels": [
18
+ 320,
19
+ 640,
20
+ 1280,
21
+ 1280
22
+ ],
23
+ "center_input_sample": false,
24
+ "class_embed_type": "identity",
25
+ "class_embeddings_concat": true,
26
+ "conv_in_kernel": 3,
27
+ "conv_out_kernel": 3,
28
+ "cross_attention_dim": 1024,
29
+ "cross_attention_norm": null,
30
+ "down_block_types": [
31
+ "CrossAttnDownBlock2D",
32
+ "CrossAttnDownBlock2D",
33
+ "CrossAttnDownBlock2D",
34
+ "DownBlock2D"
35
+ ],
36
+ "downsample_padding": 1,
37
+ "dropout": 0.0,
38
+ "dual_cross_attention": false,
39
+ "encoder_hid_dim": null,
40
+ "encoder_hid_dim_type": null,
41
+ "first_frame_condition_mode": "none",
42
+ "flip_sin_to_cos": true,
43
+ "freq_shift": 0,
44
+ "in_channels": 4,
45
+ "layers_per_block": 2,
46
+ "mid_block_only_cross_attention": null,
47
+ "mid_block_scale_factor": 1,
48
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
49
+ "n_frames": 16,
50
+ "n_temp_heads": 8,
51
+ "norm_eps": 1e-05,
52
+ "norm_num_groups": 32,
53
+ "num_attention_heads": null,
54
+ "num_class_embeds": null,
55
+ "only_cross_attention": false,
56
+ "out_channels": 4,
57
+ "projection_class_embeddings_input_dim": null,
58
+ "resnet_out_scale_factor": 1.0,
59
+ "resnet_skip_time_act": false,
60
+ "resnet_time_scale_shift": "default",
61
+ "sample_size": 64,
62
+ "temp_pos_embedding": "rotary",
63
+ "time_cond_proj_dim": null,
64
+ "time_embedding_act_fn": null,
65
+ "time_embedding_dim": 512,
66
+ "time_embedding_type": "positional",
67
+ "timestep_post_act": null,
68
+ "transformer_layers_per_block": 1,
69
+ "up_block_types": [
70
+ "UpBlock2D",
71
+ "CrossAttnUpBlock2D",
72
+ "CrossAttnUpBlock2D",
73
+ "CrossAttnUpBlock2D"
74
+ ],
75
+ "upcast_attention": false,
76
+ "use_frame_stride_condition": false,
77
+ "use_linear_projection": true,
78
+ "use_temporal": true
79
+ }
checkpoints/Cataract-1K/video_diffusion_ximg_graph/checkpoint/unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8d83c70bf76e7f21b29b2e277a61bc569489cecbceb7a721adbb918de724158
3
+ size 4964732628
checkpoints/Cataract-1K/vqgan_image/checkpoint.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2165e3562bf8634fbd2f7be018140049bd62241921ae53994d0a6a25729e650
3
+ size 878906717
checkpoints/Cataract-1K/vqgan_image/config.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 4.5e-06
3
+ target: sg2vid.taming.taming.models.vqgan.VQModel
4
+ params:
5
+ embed_dim: 1
6
+ n_embed: 16384
7
+ image_key: image
8
+ ddconfig:
9
+ double_z: false
10
+ z_channels: 1
11
+ resolution: 128
12
+ in_channels: 3
13
+ out_ch: 3
14
+ ch: 128
15
+ ch_mult:
16
+ - 1
17
+ - 2
18
+ - 2
19
+ - 4
20
+ num_res_blocks: 2
21
+ attn_resolutions:
22
+ - 16
23
+ dropout: 0.0
24
+ lossconfig:
25
+ target: sg2vid.taming.taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
26
+ params:
27
+ disc_conditional: false
28
+ disc_in_channels: 3
29
+ disc_num_layers: 2
30
+ disc_start: 1
31
+ disc_weight: 0.6
32
+ codebook_weight: 1.0
33
+ data:
34
+ target: main.DataModuleFromConfig
35
+ params:
36
+ batch_size: 32
37
+ num_workers: 16
38
+ train:
39
+ target: sg2vid.taming.taming.data.surgicaldataset.Cataract1KTrain
40
+ params:
41
+ size: 128
42
+ validation:
43
+ target: sg2vid.taming.taming.data.surgicaldataset.Cataract1KValidation
44
+ params:
45
+ size: 128
checkpoints/Cataract-1K/vqgan_segmentation/checkpoint.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49a2829d1b145d3ab72bce63ca036c47476ca21b01f04010c8b294a05e299d17
3
+ size 812256235
checkpoints/Cataract-1K/vqgan_segmentation/config.yaml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 4.5e-06
3
+ target: sg2vid.taming.taming.models.vqgan.VQSegmentationModel
4
+ params:
5
+ embed_dim: 1
6
+ n_embed: 8192
7
+ image_key: segmentation
8
+ ddconfig:
9
+ double_z: false
10
+ z_channels: 1
11
+ resolution: 128
12
+ in_channels: 14
13
+ out_ch: 14
14
+ ch: 128
15
+ ch_mult:
16
+ - 1
17
+ - 2
18
+ - 2
19
+ - 4
20
+ num_res_blocks: 2
21
+ attn_resolutions:
22
+ - 16
23
+ dropout: 0.0
24
+ lossconfig:
25
+ target: sg2vid.taming.taming.modules.losses.segmentation.BCELossWithQuant
26
+ params:
27
+ codebook_weight: 1.0
28
+ data:
29
+ target: main.DataModuleFromConfig
30
+ params:
31
+ batch_size: 32
32
+ num_workers: 16
33
+ train:
34
+ target: sg2vid.taming.taming.data.surgicaldataset.Cataract1KTrain
35
+ params:
36
+ size: 128
37
+ num_label: 14
38
+ validation:
39
+ target: sg2vid.taming.taming.data.surgicaldataset.Cataract1KValidation
40
+ params:
41
+ size: 128
42
+ num_label: 14
checkpoints/Cataracts-50/graphencoder_masked/best_val_loss.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d08129622c026da222fe7dd00142a027dc6be62a84d73364b6e282e1ddaeb8a
3
+ size 350352901
checkpoints/Cataracts-50/graphencoder_segclip/best_val_loss.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0a042fb16d1e6c849f5624febe45d4a462751ab394fed32bd1d03a9da839f3c
3
+ size 277358705
checkpoints/Cataracts-50/vae_vid_diffusion/vae/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.31.0",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D"
16
+ ],
17
+ "force_upcast": true,
18
+ "in_channels": 3,
19
+ "latent_channels": 4,
20
+ "latents_mean": null,
21
+ "latents_std": null,
22
+ "layers_per_block": 2,
23
+ "mid_block_add_attention": true,
24
+ "norm_num_groups": 32,
25
+ "out_channels": 3,
26
+ "sample_size": 512,
27
+ "scaling_factor": 0.18215,
28
+ "shift_factor": null,
29
+ "up_block_types": [
30
+ "UpDecoderBlock2D",
31
+ "UpDecoderBlock2D",
32
+ "UpDecoderBlock2D",
33
+ "UpDecoderBlock2D"
34
+ ],
35
+ "use_post_quant_conv": true,
36
+ "use_quant_conv": true
37
+ }
checkpoints/Cataracts-50/vae_vid_diffusion/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2694030ebd5c705b150907d3b6c97af3d8da9e4cf15d47b6aab460eac7483be6
3
+ size 334643268
checkpoints/Cataracts-50/video_diffusion_img_graph/checkpoint/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53b43b00b67519794a47b20210a160a544d71e5981be0269b97367616e6c6465
3
+ size 9847488907
checkpoints/Cataracts-50/video_diffusion_img_graph/checkpoint/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8b88ef07a3055934edb567215b6fef7e105165af7494b7b0e30a4c33540f880
3
+ size 15060
checkpoints/Cataracts-50/video_diffusion_img_graph/checkpoint/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94f607683c090632e8d779f519774dd0a2133d20ff4299b9edecbd7b1661901e
3
+ size 1000
checkpoints/Cataracts-50/video_diffusion_img_graph/checkpoint/unet/config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "VideoLDMUNet3DConditionModel",
3
+ "_diffusers_version": "0.21.2",
4
+ "_name_or_path": "",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": null,
9
+ "attention_head_dim": [
10
+ 5,
11
+ 10,
12
+ 20,
13
+ 20
14
+ ],
15
+ "attention_type": "default",
16
+ "augment_temporal_attention": true,
17
+ "block_out_channels": [
18
+ 320,
19
+ 640,
20
+ 1280,
21
+ 1280
22
+ ],
23
+ "center_input_sample": false,
24
+ "class_embed_type": "identity",
25
+ "class_embeddings_concat": true,
26
+ "conv_in_kernel": 3,
27
+ "conv_out_kernel": 3,
28
+ "cross_attention_dim": 1024,
29
+ "cross_attention_norm": null,
30
+ "down_block_types": [
31
+ "CrossAttnDownBlock2D",
32
+ "CrossAttnDownBlock2D",
33
+ "CrossAttnDownBlock2D",
34
+ "DownBlock2D"
35
+ ],
36
+ "downsample_padding": 1,
37
+ "dropout": 0.0,
38
+ "dual_cross_attention": false,
39
+ "encoder_hid_dim": null,
40
+ "encoder_hid_dim_type": null,
41
+ "first_frame_condition_mode": "concat",
42
+ "flip_sin_to_cos": true,
43
+ "freq_shift": 0,
44
+ "in_channels": 4,
45
+ "layers_per_block": 2,
46
+ "mid_block_only_cross_attention": null,
47
+ "mid_block_scale_factor": 1,
48
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
49
+ "n_frames": 16,
50
+ "n_temp_heads": 8,
51
+ "norm_eps": 1e-05,
52
+ "norm_num_groups": 32,
53
+ "num_attention_heads": null,
54
+ "num_class_embeds": null,
55
+ "only_cross_attention": false,
56
+ "out_channels": 4,
57
+ "projection_class_embeddings_input_dim": null,
58
+ "resnet_out_scale_factor": 1.0,
59
+ "resnet_skip_time_act": false,
60
+ "resnet_time_scale_shift": "default",
61
+ "sample_size": 64,
62
+ "temp_pos_embedding": "rotary",
63
+ "time_cond_proj_dim": null,
64
+ "time_embedding_act_fn": null,
65
+ "time_embedding_dim": 512,
66
+ "time_embedding_type": "positional",
67
+ "timestep_post_act": null,
68
+ "transformer_layers_per_block": 1,
69
+ "up_block_types": [
70
+ "UpBlock2D",
71
+ "CrossAttnUpBlock2D",
72
+ "CrossAttnUpBlock2D",
73
+ "CrossAttnUpBlock2D"
74
+ ],
75
+ "upcast_attention": false,
76
+ "use_frame_stride_condition": false,
77
+ "use_linear_projection": true,
78
+ "use_temporal": true
79
+ }
checkpoints/Cataracts-50/video_diffusion_img_graph/checkpoint/unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3ef3e4d71c90955d367b1559ebd42d471ac1bef74113439091a7a28b6ea17e1
3
+ size 4964732628
checkpoints/Cataracts-50/video_diffusion_ximg_graph/checkpoint/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05c1d1f09eacbcbc9151d2e1163ffb0fa5f143d1bfbf62d205bfecab63395a35
3
+ size 9847488907
checkpoints/Cataracts-50/video_diffusion_ximg_graph/checkpoint/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc05163b92dcc6f53896656fcf2b523cea0167cac52282df8a56ec3c9c8a0e7
3
+ size 15060
checkpoints/Cataracts-50/video_diffusion_ximg_graph/checkpoint/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94f607683c090632e8d779f519774dd0a2133d20ff4299b9edecbd7b1661901e
3
+ size 1000
checkpoints/Cataracts-50/video_diffusion_ximg_graph/checkpoint/unet/config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "VideoLDMUNet3DConditionModel",
3
+ "_diffusers_version": "0.21.2",
4
+ "_name_or_path": "",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": null,
9
+ "attention_head_dim": [
10
+ 5,
11
+ 10,
12
+ 20,
13
+ 20
14
+ ],
15
+ "attention_type": "default",
16
+ "augment_temporal_attention": true,
17
+ "block_out_channels": [
18
+ 320,
19
+ 640,
20
+ 1280,
21
+ 1280
22
+ ],
23
+ "center_input_sample": false,
24
+ "class_embed_type": "identity",
25
+ "class_embeddings_concat": true,
26
+ "conv_in_kernel": 3,
27
+ "conv_out_kernel": 3,
28
+ "cross_attention_dim": 1024,
29
+ "cross_attention_norm": null,
30
+ "down_block_types": [
31
+ "CrossAttnDownBlock2D",
32
+ "CrossAttnDownBlock2D",
33
+ "CrossAttnDownBlock2D",
34
+ "DownBlock2D"
35
+ ],
36
+ "downsample_padding": 1,
37
+ "dropout": 0.0,
38
+ "dual_cross_attention": false,
39
+ "encoder_hid_dim": null,
40
+ "encoder_hid_dim_type": null,
41
+ "first_frame_condition_mode": "none",
42
+ "flip_sin_to_cos": true,
43
+ "freq_shift": 0,
44
+ "in_channels": 4,
45
+ "layers_per_block": 2,
46
+ "mid_block_only_cross_attention": null,
47
+ "mid_block_scale_factor": 1,
48
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
49
+ "n_frames": 16,
50
+ "n_temp_heads": 8,
51
+ "norm_eps": 1e-05,
52
+ "norm_num_groups": 32,
53
+ "num_attention_heads": null,
54
+ "num_class_embeds": null,
55
+ "only_cross_attention": false,
56
+ "out_channels": 4,
57
+ "projection_class_embeddings_input_dim": null,
58
+ "resnet_out_scale_factor": 1.0,
59
+ "resnet_skip_time_act": false,
60
+ "resnet_time_scale_shift": "default",
61
+ "sample_size": 64,
62
+ "temp_pos_embedding": "rotary",
63
+ "time_cond_proj_dim": null,
64
+ "time_embedding_act_fn": null,
65
+ "time_embedding_dim": 512,
66
+ "time_embedding_type": "positional",
67
+ "timestep_post_act": null,
68
+ "transformer_layers_per_block": 1,
69
+ "up_block_types": [
70
+ "UpBlock2D",
71
+ "CrossAttnUpBlock2D",
72
+ "CrossAttnUpBlock2D",
73
+ "CrossAttnUpBlock2D"
74
+ ],
75
+ "upcast_attention": false,
76
+ "use_frame_stride_condition": false,
77
+ "use_linear_projection": true,
78
+ "use_temporal": true
79
+ }
checkpoints/Cataracts-50/video_diffusion_ximg_graph/checkpoint/unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bc32611ff6f7053a5f6b4ce1569b1ba1f501cc0b1076bfcc6dc7a67f4e69969
3
+ size 4964732628
checkpoints/Cataracts-50/vqgan_image/checkpoint.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce8fb793f91878b1887bf1b94c2aadf2e47f979b47aac74dffbf5732e0ba3c55
3
+ size 878906653
checkpoints/Cataracts-50/vqgan_image/config.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 4.5e-06
3
+ target: sg2vid.taming.taming.models.vqgan.VQModel
4
+ params:
5
+ embed_dim: 1
6
+ n_embed: 16384
7
+ image_key: image
8
+ ddconfig:
9
+ double_z: false
10
+ z_channels: 1
11
+ resolution: 128
12
+ in_channels: 3
13
+ out_ch: 3
14
+ ch: 128
15
+ ch_mult:
16
+ - 1
17
+ - 2
18
+ - 2
19
+ - 4
20
+ num_res_blocks: 2
21
+ attn_resolutions:
22
+ - 16
23
+ dropout: 0.0
24
+ lossconfig:
25
+ target: sg2vid.taming.taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
26
+ params:
27
+ disc_conditional: false
28
+ disc_in_channels: 3
29
+ disc_num_layers: 2
30
+ disc_start: 1
31
+ disc_weight: 0.6
32
+ codebook_weight: 1.0
33
+ data:
34
+ target: main.DataModuleFromConfig
35
+ params:
36
+ batch_size: 32
37
+ num_workers: 16
38
+ train:
39
+ target: sg2vid.taming.taming.data.surgicaldataset.Cataracts50Train
40
+ params:
41
+ size: 128
42
+ validation:
43
+ target: sg2vid.taming.taming.data.surgicaldataset.Cataracts50Validation
44
+ params:
45
+ size: 128
checkpoints/Cataracts-50/vqgan_segmentation/checkpoint.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55cad5263318616b1fe523bac2c13f2d04d03a5ecfe1e9bd3246e760488c524f
3
+ size 812367083
checkpoints/Cataracts-50/vqgan_segmentation/config.yaml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 4.5e-06
3
+ target: sg2vid.taming.taming.models.vqgan.VQSegmentationModel
4
+ params:
5
+ embed_dim: 1
6
+ n_embed: 8192
7
+ image_key: segmentation
8
+ ddconfig:
9
+ double_z: false
10
+ z_channels: 1
11
+ resolution: 128
12
+ in_channels: 18
13
+ out_ch: 18
14
+ ch: 128
15
+ ch_mult:
16
+ - 1
17
+ - 2
18
+ - 2
19
+ - 4
20
+ num_res_blocks: 2
21
+ attn_resolutions:
22
+ - 16
23
+ dropout: 0.0
24
+ lossconfig:
25
+ target: sg2vid.taming.taming.modules.losses.segmentation.BCELossWithQuant
26
+ params:
27
+ codebook_weight: 1.0
28
+ data:
29
+ target: main.DataModuleFromConfig
30
+ params:
31
+ batch_size: 32
32
+ num_workers: 16
33
+ train:
34
+ target: sg2vid.taming.taming.data.surgicaldataset.Cataracts50Train
35
+ params:
36
+ size: 128
37
+ num_label: 18
38
+ validation:
39
+ target: sg2vid.taming.taming.data.surgicaldataset.Cataracts50Validation
40
+ params:
41
+ size: 128
42
+ num_label: 18
checkpoints/Cholec-80/graphencoder_masked/best_val_loss.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16ec1b12a632c2467602a5860c771469d86f3c60c6fbb9cc9edd21ee2fb53352
3
+ size 350311941
checkpoints/Cholec-80/graphencoder_segclip/best_val_loss.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9894478455b8b82940187caff0f40aaea580a2d89a47f55c2df2d8632c48f658
3
+ size 277271601
checkpoints/Cholec-80/vae_vid_diffusion/vae/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.31.0",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D"
16
+ ],
17
+ "force_upcast": true,
18
+ "in_channels": 3,
19
+ "latent_channels": 4,
20
+ "latents_mean": null,
21
+ "latents_std": null,
22
+ "layers_per_block": 2,
23
+ "mid_block_add_attention": true,
24
+ "norm_num_groups": 32,
25
+ "out_channels": 3,
26
+ "sample_size": 512,
27
+ "scaling_factor": 0.18215,
28
+ "shift_factor": null,
29
+ "up_block_types": [
30
+ "UpDecoderBlock2D",
31
+ "UpDecoderBlock2D",
32
+ "UpDecoderBlock2D",
33
+ "UpDecoderBlock2D"
34
+ ],
35
+ "use_post_quant_conv": true,
36
+ "use_quant_conv": true
37
+ }
checkpoints/Cholec-80/vae_vid_diffusion/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b24c6234ffb7d27837fbb7203a70b34ad064acfec9c15f07bc15a03c60bd7b3e
3
+ size 334643268
checkpoints/Cholec-80/video_diffusion_img_graph/checkpoint/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cc8f5b6bf8603e186d3b15d0f6cbc7b8ffbfd150a14db228295995dcc6fe68f
3
+ size 9847488907
checkpoints/Cholec-80/video_diffusion_img_graph/checkpoint/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02538c587dbd67c0e812d199aef09f8595fc8b5f11f89c0dd46f32cfc31c62a6
3
+ size 15060
checkpoints/Cholec-80/video_diffusion_img_graph/checkpoint/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c776fe9e8b0b55b3a6dc8e06a2d0ccf5cecf4f2ff67df8c22794b8a6e2a9d288
3
+ size 1000
checkpoints/Cholec-80/video_diffusion_img_graph/checkpoint/unet/config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "VideoLDMUNet3DConditionModel",
3
+ "_diffusers_version": "0.21.2",
4
+ "_name_or_path": "",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": null,
9
+ "attention_head_dim": [
10
+ 5,
11
+ 10,
12
+ 20,
13
+ 20
14
+ ],
15
+ "attention_type": "default",
16
+ "augment_temporal_attention": true,
17
+ "block_out_channels": [
18
+ 320,
19
+ 640,
20
+ 1280,
21
+ 1280
22
+ ],
23
+ "center_input_sample": false,
24
+ "class_embed_type": "identity",
25
+ "class_embeddings_concat": true,
26
+ "conv_in_kernel": 3,
27
+ "conv_out_kernel": 3,
28
+ "cross_attention_dim": 1024,
29
+ "cross_attention_norm": null,
30
+ "down_block_types": [
31
+ "CrossAttnDownBlock2D",
32
+ "CrossAttnDownBlock2D",
33
+ "CrossAttnDownBlock2D",
34
+ "DownBlock2D"
35
+ ],
36
+ "downsample_padding": 1,
37
+ "dropout": 0.0,
38
+ "dual_cross_attention": false,
39
+ "encoder_hid_dim": null,
40
+ "encoder_hid_dim_type": null,
41
+ "first_frame_condition_mode": "concat",
42
+ "flip_sin_to_cos": true,
43
+ "freq_shift": 0,
44
+ "in_channels": 4,
45
+ "layers_per_block": 2,
46
+ "mid_block_only_cross_attention": null,
47
+ "mid_block_scale_factor": 1,
48
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
49
+ "n_frames": 16,
50
+ "n_temp_heads": 8,
51
+ "norm_eps": 1e-05,
52
+ "norm_num_groups": 32,
53
+ "num_attention_heads": null,
54
+ "num_class_embeds": null,
55
+ "only_cross_attention": false,
56
+ "out_channels": 4,
57
+ "projection_class_embeddings_input_dim": null,
58
+ "resnet_out_scale_factor": 1.0,
59
+ "resnet_skip_time_act": false,
60
+ "resnet_time_scale_shift": "default",
61
+ "sample_size": 64,
62
+ "temp_pos_embedding": "rotary",
63
+ "time_cond_proj_dim": null,
64
+ "time_embedding_act_fn": null,
65
+ "time_embedding_dim": 512,
66
+ "time_embedding_type": "positional",
67
+ "timestep_post_act": null,
68
+ "transformer_layers_per_block": 1,
69
+ "up_block_types": [
70
+ "UpBlock2D",
71
+ "CrossAttnUpBlock2D",
72
+ "CrossAttnUpBlock2D",
73
+ "CrossAttnUpBlock2D"
74
+ ],
75
+ "upcast_attention": false,
76
+ "use_frame_stride_condition": false,
77
+ "use_linear_projection": true,
78
+ "use_temporal": true
79
+ }
checkpoints/Cholec-80/video_diffusion_img_graph/checkpoint/unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a68ccc867f7967f8081093660b34aa254d5ea68562a511a9c2af0a0f658a16cd
3
+ size 4964732628
checkpoints/Cholec-80/video_diffusion_ximg_graph/checkpoint/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffc90366c234b263b45ec409e013c24e5f3446d8c9bd22c5fbcfc692324420c1
3
+ size 9847488907
checkpoints/Cholec-80/video_diffusion_ximg_graph/checkpoint/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:302e5b8d4b016f6b594da211ac28069cb4e864b1d752b01a72c401783deca4fb
3
+ size 15060
checkpoints/Cholec-80/video_diffusion_ximg_graph/checkpoint/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94f607683c090632e8d779f519774dd0a2133d20ff4299b9edecbd7b1661901e
3
+ size 1000
checkpoints/Cholec-80/video_diffusion_ximg_graph/checkpoint/unet/config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "VideoLDMUNet3DConditionModel",
3
+ "_diffusers_version": "0.21.2",
4
+ "_name_or_path": "",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": null,
9
+ "attention_head_dim": [
10
+ 5,
11
+ 10,
12
+ 20,
13
+ 20
14
+ ],
15
+ "attention_type": "default",
16
+ "augment_temporal_attention": true,
17
+ "block_out_channels": [
18
+ 320,
19
+ 640,
20
+ 1280,
21
+ 1280
22
+ ],
23
+ "center_input_sample": false,
24
+ "class_embed_type": "identity",
25
+ "class_embeddings_concat": true,
26
+ "conv_in_kernel": 3,
27
+ "conv_out_kernel": 3,
28
+ "cross_attention_dim": 1024,
29
+ "cross_attention_norm": null,
30
+ "down_block_types": [
31
+ "CrossAttnDownBlock2D",
32
+ "CrossAttnDownBlock2D",
33
+ "CrossAttnDownBlock2D",
34
+ "DownBlock2D"
35
+ ],
36
+ "downsample_padding": 1,
37
+ "dropout": 0.0,
38
+ "dual_cross_attention": false,
39
+ "encoder_hid_dim": null,
40
+ "encoder_hid_dim_type": null,
41
+ "first_frame_condition_mode": "none",
42
+ "flip_sin_to_cos": true,
43
+ "freq_shift": 0,
44
+ "in_channels": 4,
45
+ "layers_per_block": 2,
46
+ "mid_block_only_cross_attention": null,
47
+ "mid_block_scale_factor": 1,
48
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
49
+ "n_frames": 16,
50
+ "n_temp_heads": 8,
51
+ "norm_eps": 1e-05,
52
+ "norm_num_groups": 32,
53
+ "num_attention_heads": null,
54
+ "num_class_embeds": null,
55
+ "only_cross_attention": false,
56
+ "out_channels": 4,
57
+ "projection_class_embeddings_input_dim": null,
58
+ "resnet_out_scale_factor": 1.0,
59
+ "resnet_skip_time_act": false,
60
+ "resnet_time_scale_shift": "default",
61
+ "sample_size": 64,
62
+ "temp_pos_embedding": "rotary",
63
+ "time_cond_proj_dim": null,
64
+ "time_embedding_act_fn": null,
65
+ "time_embedding_dim": 512,
66
+ "time_embedding_type": "positional",
67
+ "timestep_post_act": null,
68
+ "transformer_layers_per_block": 1,
69
+ "up_block_types": [
70
+ "UpBlock2D",
71
+ "CrossAttnUpBlock2D",
72
+ "CrossAttnUpBlock2D",
73
+ "CrossAttnUpBlock2D"
74
+ ],
75
+ "upcast_attention": false,
76
+ "use_frame_stride_condition": false,
77
+ "use_linear_projection": true,
78
+ "use_temporal": true
79
+ }
checkpoints/Cholec-80/video_diffusion_ximg_graph/checkpoint/unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7deb44d5d8b715315bcc4656ea484f4942072ef0ddf942a794056ca645c08b3f
3
+ size 4964732628