bubbliiiing committed on
Commit
fcb0891
1 Parent(s): 940966b

Update config

Browse files
Files changed (3) hide show
  1. README.md +10 -11
  2. transformer/config.json +10 -1
  3. vae/config.json +32 -11
README.md CHANGED
@@ -1,10 +1,11 @@
1
  ---
2
- license: apache-2.0
3
- language:
4
- - en
5
- tags:
6
- - text-generation-inference
7
  ---
 
8
  # 📷 EasyAnimate | An End-to-End Solution for High-Resolution and Long Video Generation
9
  😊 EasyAnimate is an end-to-end solution for generating high-resolution and long videos. We can train transformer-based diffusion generators, train VAEs for processing long videos, and preprocess metadata.
10
 
@@ -57,13 +58,11 @@ cd ../../
57
  # Model zoo
58
 
59
  EasyAnimateV3:
60
- | Name | Type | Storage Space | Url | Hugging Face | Description |
61
  |--|--|--|--|--|--|
62
-
63
- | EasyAnimateV3-XL-2-InP-512x512.tar | EasyAnimateV3 | 16.2GB | [Download](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Diffusion_Transformer/EasyAnimateV3-XL-2-InP-512x512.tar) | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-512x512) | EasyAnimateV3 official weights for 512x512 image to video resolution. Training with 144 frames and fps 24 |
64
- | EasyAnimateV3-XL-2-InP-768x768.tar | EasyAnimateV3 | 16.2GB | [Download](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Diffusion_Transformer/EasyAnimateV3-XL-2-InP-768x768.tar) | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-768x768) | EasyAnimateV3 official weights for 768x768 image to video resolution. Training with 144 frames and fps 24 |
65
- | EasyAnimateV3-XL-2-InP-960x960.tar | EasyAnimateV3 | 16.2GB | [Download](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Diffusion_Transformer/EasyAnimateV3-XL-2-InP-960x960.tar) | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-960x960) | EasyAnimateV3 official weights for 960x960 image to video resolution. Training with 144 frames and fps 24 |
66
- | easyanimatev3_minimalism_lora.safetensors | Lora of Pixart | 485.1MB | [Download](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Personalized_Model/easyanimatev2_minimalism_lora.safetensors) | - | A LoRA trained on a specific type of images. Images can be downloaded from [Url](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/asset/v2/Minimalism.zip). |
67
 
68
  # Algorithm Detailed
69
  ### 1. Data Preprocessing
 
1
  ---
2
+ frameworks:
3
+ - Pytorch
4
+ license: Apache License 2.0
5
+ tasks:
6
+ - text-to-video-synthesis
7
  ---
8
+
9
  # 📷 EasyAnimate | An End-to-End Solution for High-Resolution and Long Video Generation
10
  😊 EasyAnimate is an end-to-end solution for generating high-resolution and long videos. We can train transformer-based diffusion generators, train VAEs for processing long videos, and preprocess metadata.
11
 
 
58
  # Model zoo
59
 
60
  EasyAnimateV3:
61
+ | Name | Type | Storage Space | Hugging Face | Model Scope | Description |
62
  |--|--|--|--|--|--|
63
+ | EasyAnimateV3-XL-2-InP-512x512.tar | EasyAnimateV3 | 18.2GB | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-512x512) | [😄Link](https://modelscope.cn/models/PAI/EasyAnimateV3-XL-2-InP-512x512) | EasyAnimateV3 official weights for text-to-video and image-to-video generation at 512x512 resolution. Trained with 144 frames at 24 fps |
64
+ | EasyAnimateV3-XL-2-InP-768x768.tar | EasyAnimateV3 | 18.2GB | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-768x768) | [😄Link](https://modelscope.cn/models/PAI/EasyAnimateV3-XL-2-InP-768x768) | EasyAnimateV3 official weights for text-to-video and image-to-video generation at 768x768 resolution. Trained with 144 frames at 24 fps |
65
+ | EasyAnimateV3-XL-2-InP-960x960.tar | EasyAnimateV3 | 18.2GB | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV3-XL-2-InP-960x960) | [😄Link](https://modelscope.cn/models/PAI/EasyAnimateV3-XL-2-InP-960x960) | EasyAnimateV3 official weights for text-to-video and image-to-video generation at 960x960 resolution. Trained with 144 frames at 24 fps |
 
 
66
 
67
  # Algorithm Detailed
68
  ### 1. Data Preprocessing
transformer/config.json CHANGED
@@ -1,7 +1,9 @@
1
  {
2
  "_class_name": "Transformer3DModel",
3
- "_diffusers_version": "0.27.0",
4
  "activation_fn": "gelu-approximate",
 
 
5
  "attention_bias": true,
6
  "attention_head_dim": 72,
7
  "attention_type": "default",
@@ -13,11 +15,14 @@
13
  "decay": 0.9999,
14
  "double_self_attention": false,
15
  "dropout": 0.0,
 
 
16
  "enable_uvit": true,
17
  "fake_3d": false,
18
  "in_channels": 12,
19
  "inv_gamma": 1.0,
20
  "min_decay": 0.0,
 
21
  "motion_module_kwargs_even": {
22
  "attention_block_types": [
23
  "Temporal_Self",
@@ -26,6 +31,7 @@
26
  "block_size": 1,
27
  "num_attention_heads": 16,
28
  "num_transformer_block": 1,
 
29
  "temporal_attention_dim_div": 1,
30
  "temporal_position_encoding": true,
31
  "temporal_position_encoding_max_len": 4096
@@ -38,6 +44,7 @@
38
  "block_size": 1,
39
  "num_attention_heads": 16,
40
  "num_transformer_block": 1,
 
41
  "temporal_attention_dim_div": 1,
42
  "temporal_position_encoding": true,
43
  "temporal_position_encoding_max_len": 4096
@@ -57,6 +64,8 @@
57
  "patch_3d": false,
58
  "patch_size": 2,
59
  "power": 0.6666666666666666,
 
 
60
  "sample_size": 64,
61
  "time_patch_size": null,
62
  "time_position_encoding_before_transformer": false,
 
1
  {
2
  "_class_name": "Transformer3DModel",
3
+ "_diffusers_version": "0.30.1",
4
  "activation_fn": "gelu-approximate",
5
+ "add_noise_in_inpaint_model": false,
6
+ "after_norm": false,
7
  "attention_bias": true,
8
  "attention_head_dim": 72,
9
  "attention_type": "default",
 
15
  "decay": 0.9999,
16
  "double_self_attention": false,
17
  "dropout": 0.0,
18
+ "enable_clip_in_inpaint": true,
19
+ "enable_text_attention_mask": true,
20
  "enable_uvit": true,
21
  "fake_3d": false,
22
  "in_channels": 12,
23
  "inv_gamma": 1.0,
24
  "min_decay": 0.0,
25
+ "motion_module_kwargs": null,
26
  "motion_module_kwargs_even": {
27
  "attention_block_types": [
28
  "Temporal_Self",
 
31
  "block_size": 1,
32
  "num_attention_heads": 16,
33
  "num_transformer_block": 1,
34
+ "remove_time_embedding_in_photo": false,
35
  "temporal_attention_dim_div": 1,
36
  "temporal_position_encoding": true,
37
  "temporal_position_encoding_max_len": 4096
 
44
  "block_size": 1,
45
  "num_attention_heads": 16,
46
  "num_transformer_block": 1,
47
+ "remove_time_embedding_in_photo": false,
48
  "temporal_attention_dim_div": 1,
49
  "temporal_position_encoding": true,
50
  "temporal_position_encoding_max_len": 4096
 
64
  "patch_3d": false,
65
  "patch_size": 2,
66
  "power": 0.6666666666666666,
67
+ "qk_norm": false,
68
+ "resize_inpaint_mask_directly": false,
69
  "sample_size": 64,
70
  "time_patch_size": null,
71
  "time_position_encoding_before_transformer": false,
vae/config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "_class_name": "AutoencoderKL",
3
- "_diffusers_version": "0.22.0.dev0",
4
  "act_fn": "silu",
5
  "block_out_channels": [
6
  128,
@@ -8,9 +8,18 @@
8
  512,
9
  512
10
  ],
 
 
 
 
 
 
 
 
 
11
  "down_block_types": [
12
- "SpatialDownBlock3D",
13
- "SpatialTemporalDownBlock3D",
14
  "SpatialTemporalDownBlock3D",
15
  "SpatialTemporalDownBlock3D"
16
  ],
@@ -18,19 +27,31 @@
18
  "in_channels": 3,
19
  "latent_channels": 4,
20
  "layers_per_block": 2,
 
 
 
 
 
 
21
  "norm_num_groups": 32,
 
22
  "out_channels": 3,
23
  "sample_size": 256,
24
  "scaling_factor": 0.18215,
25
  "slice_compression_vae": true,
26
- "use_tiling": true,
27
- "mid_block_attention_type": "3d",
28
- "mini_batch_encoder": 8,
29
- "mini_batch_decoder": 2,
30
  "up_block_types": [
31
- "SpatialUpBlock3D",
32
- "SpatialTemporalUpBlock3D",
33
  "SpatialTemporalUpBlock3D",
34
  "SpatialTemporalUpBlock3D"
35
- ]
 
 
 
 
 
36
  }
 
1
  {
2
+ "_class_name": "AutoencoderKLMagvit",
3
+ "_diffusers_version": "0.30.1",
4
  "act_fn": "silu",
5
  "block_out_channels": [
6
  128,
 
8
  512,
9
  512
10
  ],
11
+ "cache_compression_vae": false,
12
+ "cache_mag_vae": false,
13
+ "ch": 128,
14
+ "ch_mult": [
15
+ 1,
16
+ 2,
17
+ 4,
18
+ 4
19
+ ],
20
  "down_block_types": [
21
+ "SpatialDownBlock3D",
22
+ "SpatialTemporalDownBlock3D",
23
  "SpatialTemporalDownBlock3D",
24
  "SpatialTemporalDownBlock3D"
25
  ],
 
27
  "in_channels": 3,
28
  "latent_channels": 4,
29
  "layers_per_block": 2,
30
+ "mid_block_attention_type": "3d",
31
+ "mid_block_num_attention_heads": 1,
32
+ "mid_block_type": "MidBlock3D",
33
+ "mid_block_use_attention": true,
34
+ "mini_batch_decoder": 2,
35
+ "mini_batch_encoder": 8,
36
  "norm_num_groups": 32,
37
+ "num_attention_heads": 1,
38
  "out_channels": 3,
39
  "sample_size": 256,
40
  "scaling_factor": 0.18215,
41
  "slice_compression_vae": true,
42
+ "slice_mag_vae": true,
43
+ "spatial_group_norm": false,
44
+ "tile_overlap_factor": 0.25,
45
+ "tile_sample_min_size": 384,
46
  "up_block_types": [
47
+ "SpatialUpBlock3D",
48
+ "SpatialTemporalUpBlock3D",
49
  "SpatialTemporalUpBlock3D",
50
  "SpatialTemporalUpBlock3D"
51
+ ],
52
+ "upcast_vae": false,
53
+ "use_gc_blocks": null,
54
+ "use_tiling": true,
55
+ "use_tiling_decoder": false,
56
+ "use_tiling_encoder": false
57
  }