svjack committed · verified
Commit a7f3565 · 1 Parent(s): 0df1521

Upload 106 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +1 -0
  2. README_pre.md +257 -0
  3. app.py +79 -0
  4. assets/00.gif +0 -0
  5. assets/01.gif +0 -0
  6. assets/02.gif +0 -0
  7. assets/03.gif +0 -0
  8. assets/04.gif +0 -0
  9. assets/05.gif +0 -0
  10. assets/06.gif +0 -0
  11. assets/07.gif +0 -0
  12. assets/08.gif +0 -0
  13. assets/09.gif +0 -0
  14. assets/10.gif +0 -0
  15. assets/11.gif +0 -0
  16. assets/12.gif +0 -0
  17. assets/13.gif +3 -0
  18. assets/72105_388.mp4_00-00.png +0 -0
  19. assets/72105_388.mp4_00-01.png +0 -0
  20. assets/72109_125.mp4_00-00.png +0 -0
  21. assets/72109_125.mp4_00-01.png +0 -0
  22. assets/72110_255.mp4_00-00.png +0 -0
  23. assets/72110_255.mp4_00-01.png +0 -0
  24. assets/74302_1349_frame1.png +0 -0
  25. assets/74302_1349_frame3.png +0 -0
  26. assets/Japan_v2_1_070321_s3_frame1.png +0 -0
  27. assets/Japan_v2_1_070321_s3_frame3.png +0 -0
  28. assets/Japan_v2_2_062266_s2_frame1.png +0 -0
  29. assets/Japan_v2_2_062266_s2_frame3.png +0 -0
  30. assets/frame0001_05.png +0 -0
  31. assets/frame0001_09.png +0 -0
  32. assets/frame0001_10.png +0 -0
  33. assets/frame0001_11.png +0 -0
  34. assets/frame0016_10.png +0 -0
  35. assets/frame0016_11.png +0 -0
  36. checkpoints/tooncrafter_512_interp_v1/model go here.txt +0 -0
  37. configs/inference_512_v1.0.yaml +103 -0
  38. configs/training_1024_v1.0/config.yaml +166 -0
  39. configs/training_1024_v1.0/run.sh +37 -0
  40. configs/training_512_v1.0/config.yaml +166 -0
  41. configs/training_512_v1.0/run.sh +37 -0
  42. genshin_impact_img/AYATO_source.webp +0 -0
  43. genshin_impact_img/AYATO_target.webp +0 -0
  44. genshin_impact_img/ZHONGLI_source.webp +0 -0
  45. genshin_impact_img/ZHONGLI_target.webp +0 -0
  46. genshin_impact_img/ayato_smiling.mp4 +0 -0
  47. genshin_impact_img/zhongli_sitting_down.mp4 +0 -0
  48. gradio_app.py +82 -0
  49. lvdm/__pycache__/basics.cpython-310.pyc +0 -0
  50. lvdm/__pycache__/common.cpython-310.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/13.gif filter=lfs diff=lfs merge=lfs -text
README_pre.md ADDED
@@ -0,0 +1,257 @@
1
+ ## ___***ToonCrafter: Generative Cartoon Interpolation***___
2
+ <!-- ![](./assets/logo_long.png#gh-light-mode-only){: width="50%"} -->
3
+ <!-- ![](./assets/logo_long_dark.png#gh-dark-mode-only=100x20) -->
4
+ <div align="center">
5
+
6
+
7
+
8
+ </div>
9
+
10
+ ## 🔆 Introduction
11
+
12
+ ⚠️ Please check our [disclaimer](#disc) first.
13
+
14
+ 🤗 ToonCrafter can interpolate between two cartoon images by leveraging pre-trained image-to-video diffusion priors. Please check our project page and paper for more information. <br>
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+ ### 1.1 Showcases (512x320)
23
+ <table class="center">
24
+ <tr style="font-weight: bolder;text-align:center;">
25
+ <td>Input starting frame</td>
26
+ <td>Input ending frame</td>
27
+ <td>Generated video</td>
28
+ </tr>
29
+ <tr>
30
+ <td>
31
+ <img src=assets/72109_125.mp4_00-00.png width="250">
32
+ </td>
33
+ <td>
34
+ <img src=assets/72109_125.mp4_00-01.png width="250">
35
+ </td>
36
+ <td>
37
+ <img src=assets/00.gif width="250">
38
+ </td>
39
+ </tr>
40
+
41
+
42
+ <tr>
43
+ <td>
44
+ <img src=assets/Japan_v2_2_062266_s2_frame1.png width="250">
45
+ </td>
46
+ <td>
47
+ <img src=assets/Japan_v2_2_062266_s2_frame3.png width="250">
48
+ </td>
49
+ <td>
50
+ <img src=assets/03.gif width="250">
51
+ </td>
52
+ </tr>
53
+ <tr>
54
+ <td>
55
+ <img src=assets/Japan_v2_1_070321_s3_frame1.png width="250">
56
+ </td>
57
+ <td>
58
+ <img src=assets/Japan_v2_1_070321_s3_frame3.png width="250">
59
+ </td>
60
+ <td>
61
+ <img src=assets/02.gif width="250">
62
+ </td>
63
+ </tr>
64
+ <tr>
65
+ <td>
66
+ <img src=assets/74302_1349_frame1.png width="250">
67
+ </td>
68
+ <td>
69
+ <img src=assets/74302_1349_frame3.png width="250">
70
+ </td>
71
+ <td>
72
+ <img src=assets/01.gif width="250">
73
+ </td>
74
+ </tr>
75
+ </table>
76
+
77
+ ### 1.2 Sparse sketch guidance
78
+ <table class="center">
79
+ <tr style="font-weight: bolder;text-align:center;">
80
+ <td>Input starting frame</td>
81
+ <td>Input ending frame</td>
82
+ <td>Input sketch guidance</td>
83
+ <td>Generated video</td>
84
+ </tr>
85
+ <tr>
86
+ <td>
87
+ <img src=assets/72105_388.mp4_00-00.png width="200">
88
+ </td>
89
+ <td>
90
+ <img src=assets/72105_388.mp4_00-01.png width="200">
91
+ </td>
92
+ <td>
93
+ <img src=assets/06.gif width="200">
94
+ </td>
95
+ <td>
96
+ <img src=assets/07.gif width="200">
97
+ </td>
98
+ </tr>
99
+
100
+ <tr>
101
+ <td>
102
+ <img src=assets/72110_255.mp4_00-00.png width="200">
103
+ </td>
104
+ <td>
105
+ <img src=assets/72110_255.mp4_00-01.png width="200">
106
+ </td>
107
+ <td>
108
+ <img src=assets/12.gif width="200">
109
+ </td>
110
+ <td>
111
+ <img src=assets/13.gif width="200">
112
+ </td>
113
+ </tr>
114
+
115
+
116
+ </table>
117
+
118
+
119
+ ### 2. Applications
120
+ #### 2.1 Cartoon Sketch Interpolation (see project page for more details)
121
+ <table class="center">
122
+ <tr style="font-weight: bolder;text-align:center;">
123
+ <td>Input starting frame</td>
124
+ <td>Input ending frame</td>
125
+ <td>Generated video</td>
126
+ </tr>
127
+
128
+ <tr>
129
+ <td>
130
+ <img src=assets/frame0001_10.png width="250">
131
+ </td>
132
+ <td>
133
+ <img src=assets/frame0016_10.png width="250">
134
+ </td>
135
+ <td>
136
+ <img src=assets/10.gif width="250">
137
+ </td>
138
+ </tr>
139
+
140
+
141
+ <tr>
142
+ <td>
143
+ <img src=assets/frame0001_11.png width="250">
144
+ </td>
145
+ <td>
146
+ <img src=assets/frame0016_11.png width="250">
147
+ </td>
148
+ <td>
149
+ <img src=assets/11.gif width="250">
150
+ </td>
151
+ </tr>
152
+
153
+ </table>
154
+
155
+
156
+ #### 2.2 Reference-based Sketch Colorization
157
+ <table class="center">
158
+ <tr style="font-weight: bolder;text-align:center;">
159
+ <td>Input sketch</td>
160
+ <td>Input reference</td>
161
+ <td>Colorization results</td>
162
+ </tr>
163
+
164
+ <tr>
165
+ <td>
166
+ <img src=assets/04.gif width="250">
167
+ </td>
168
+ <td>
169
+ <img src=assets/frame0001_05.png width="250">
170
+ </td>
171
+ <td>
172
+ <img src=assets/05.gif width="250">
173
+ </td>
174
+ </tr>
175
+
176
+
177
+ <tr>
178
+ <td>
179
+ <img src=assets/08.gif width="250">
180
+ </td>
181
+ <td>
182
+ <img src=assets/frame0001_09.png width="250">
183
+ </td>
184
+ <td>
185
+ <img src=assets/09.gif width="250">
186
+ </td>
187
+ </tr>
188
+
189
+ </table>
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+ ## 📝 Changelog
198
+ - [ ] Add sketch control and colorization functions.
199
+ - __[2024.05.29]__: 🔥🔥 Release code and model weights.
200
+ - __[2024.05.28]__: Launch the project page and update the arXiv preprint.
201
+ <br>
202
+
203
+
204
+ ## 🧰 Models
205
+
206
+ |Model|Resolution|GPU Mem. & Inference Time (A100, DDIM 50 steps)|Checkpoint|
207
+ |:---------|:---------|:--------|:--------|
208
+ |ToonCrafter_512|320x512| TBD (`perframe_ae=True`)|[Hugging Face](https://huggingface.co/Doubiiu/ToonCrafter/blob/main/model.ckpt)|
209
+
210
+
211
+ Currently, ToonCrafter supports generating videos of up to 16 frames at a resolution of 512x320. Inference time can be reduced by using fewer DDIM steps.
212
+
213
+
214
+
215
+ ## ⚙️ Setup
216
+
217
+ ### Install Environment via Anaconda (Recommended)
218
+ ```bash
219
+ conda create -n tooncrafter python=3.8.5
220
+ conda activate tooncrafter
221
+ pip install -r requirements.txt
222
+ ```
223
+
224
+
225
+ ## 💫 Inference
226
+ ### 1. Command line
227
+
228
+ Download the pretrained ToonCrafter_512 checkpoint and place `model.ckpt` at `checkpoints/tooncrafter_512_interp_v1/model.ckpt`.
229
+ ```bash
230
+ sh scripts/run.sh
231
+ ```
232
+
233
+
234
+ ### 2. Local Gradio demo
235
+
236
+ Download the pretrained model and put it in the corresponding directory as described above.
237
+ ```bash
238
+ python gradio_app.py
239
+ ```
240
+
241
+
242
+
243
+
244
+
245
+
246
+ <!-- ## 🤝 Community Support -->
247
+
248
+
249
+
250
+ <a name="disc"></a>
251
+ ## 📢 Disclaimer
252
+ Calm down. Our framework opens up the era of generative cartoon interpolation, but due to the variability of the generative video prior, the success rate is not guaranteed.
253
+
254
+ ⚠️ This is an open-source research exploration, not a commercial product, and it may not meet all your expectations.
255
+
256
+ This project strives to impact the domain of AI-driven video generation positively. Users are granted the freedom to create videos using this tool, but they are expected to comply with local laws and utilize it responsibly. The developers do not assume any responsibility for potential misuse by users.
257
+ ****
app.py ADDED
@@ -0,0 +1,79 @@
1
+ import os
2
+ import argparse
3
+ import sys
4
+ import gradio as gr
5
+ from scripts.gradio.i2v_test_application import Image2Video
6
+ sys.path.insert(1, os.path.join(sys.path[0], 'lvdm'))
7
+
8
+ i2v_examples_interp_512 = [
9
+ ['genshin_impact_img/ZHONGLI_source.webp', 'An anime character sitting down', 50, 3.5, 1.0, 30, 123, 'genshin_impact_img/ZHONGLI_target.webp'],
10
+ ['genshin_impact_img/AYATO_source.webp', 'an anime man smiling', 50, 3.5, 1.0, 30, 123, 'genshin_impact_img/AYATO_target.webp'],
11
+ ['prompts/512_interp/74906_1462_frame1.png', 'walking man', 50, 7.5, 1.0, 10, 123, 'prompts/512_interp/74906_1462_frame3.png'],
12
+ ['prompts/512_interp/Japan_v2_2_062266_s2_frame1.png', 'an anime scene', 50, 7.5, 1.0, 10, 789, 'prompts/512_interp/Japan_v2_2_062266_s2_frame3.png'],
13
+ ['prompts/512_interp/Japan_v2_3_119235_s2_frame1.png', 'an anime scene', 50, 7.5, 1.0, 10, 123, 'prompts/512_interp/Japan_v2_3_119235_s2_frame3.png'],
14
+ ]
15
+
16
+ def dynamicrafter_demo(result_dir='./tmp/', res=512):
17
+ if res == 1024:
18
+ resolution = '576_1024'
19
+ css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height:576px}"""
20
+ elif res == 512:
21
+ resolution = '320_512'
22
+ css = """#input_img {max-width: 512px !important} #input_img2 {max-width: 512px !important} #output_vid {max-width: 512px; max-height: 320px}"""
23
+ elif res == 256:
24
+ resolution = '256_256'
25
+ css = """#input_img {max-width: 256px !important} #output_vid {max-width: 256px; max-height: 256px}"""
26
+ else:
27
+ raise NotImplementedError(f"Unsupported resolution: {res}")
28
+ image2video = Image2Video(result_dir, resolution=resolution)
29
+ with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
30
+ dynamicrafter_iface.title = "Image to Video Converter (with Genshin Impact Demo)"  # set the demo title
31
+
32
+ with gr.Tab(label='ToonCrafter_320x512'):
33
+ with gr.Column():
34
+ with gr.Row():
35
+ with gr.Column():
36
+ with gr.Row():
37
+ i2v_input_image = gr.Image(label="Input Image1", elem_id="input_img")
38
+ with gr.Row():
39
+ i2v_input_text = gr.Text(label='Prompts')
40
+ with gr.Row():
41
+ i2v_seed = gr.Slider(label='Random Seed', minimum=0, maximum=50000, step=1, value=123)
42
+ i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="i2v_eta")
43
+ i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.5, elem_id="i2v_cfg_scale")
44
+ with gr.Row():
45
+ i2v_steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
46
+ i2v_motion = gr.Slider(minimum=5, maximum=30, step=1, elem_id="i2v_motion", label="FPS", value=10)
47
+ i2v_end_btn = gr.Button("Generate")
48
+ with gr.Column():
49
+ with gr.Row():
50
+ i2v_input_image2 = gr.Image(label="Input Image2", elem_id="input_img2")
51
+ with gr.Row():
52
+ i2v_output_video = gr.Video(label="Generated Video", elem_id="output_vid", autoplay=True, show_share_button=True)
53
+
54
+ gr.Examples(examples=i2v_examples_interp_512,
55
+ inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_input_image2],
56
+ outputs=[i2v_output_video],
57
+ fn=image2video.get_image,
58
+ cache_examples=False,
59
+ )
60
+ i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_input_image2],
61
+ outputs=[i2v_output_video],
62
+ fn=image2video.get_image
63
+ )
64
+
65
+ return dynamicrafter_iface
66
+
67
+ def get_parser():
68
+ parser = argparse.ArgumentParser()
69
+ return parser
70
+
71
+ if __name__ == "__main__":
72
+ parser = get_parser()
73
+ args = parser.parse_args()
74
+
75
+ result_dir = os.path.join('./', 'results')
76
+ dynamicrafter_iface = dynamicrafter_demo(result_dir)
77
+ dynamicrafter_iface.queue(max_size=12)
78
+ dynamicrafter_iface.launch(max_threads=1, share=True)
79
+ #dynamicrafter_iface.launch(server_name='0.0.0.0', server_port=8080, max_threads=1)
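`app.py` wires `Image2Video.get_image` as the click handler, so the example rows above also document the argument order. A sketch of invoking the same entry point programmatically; `Image2Video` lives in `scripts/gradio/i2v_test_application`, which is outside this truncated 50-file view, so the exact signature is assumed from the Gradio wiring rather than verified:

```python
# Sketch only: mirrors the gr.Examples / click() inputs in app.py above.
from scripts.gradio.i2v_test_application import Image2Video  # not shown in this view

i2v = Image2Video('./tmp/', resolution='320_512')
video = i2v.get_image(
    'genshin_impact_img/ZHONGLI_source.webp',  # starting frame
    'An anime character sitting down',         # prompt
    50,                                        # sampling steps
    3.5,                                       # CFG scale
    1.0,                                       # ETA
    30,                                        # FPS
    123,                                       # random seed
    'genshin_impact_img/ZHONGLI_target.webp',  # ending frame
)
print(video)  # presumably the path of the generated clip
```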
assets/00.gif ADDED
assets/01.gif ADDED
assets/02.gif ADDED
assets/03.gif ADDED
assets/04.gif ADDED
assets/05.gif ADDED
assets/06.gif ADDED
assets/07.gif ADDED
assets/08.gif ADDED
assets/09.gif ADDED
assets/10.gif ADDED
assets/11.gif ADDED
assets/12.gif ADDED
assets/13.gif ADDED

Git LFS Details

  • SHA256: 179af7d265d8790c0ca31a5898f870961b0a738b02d9fd0c991a3a75651cbb56
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
assets/72105_388.mp4_00-00.png ADDED
assets/72105_388.mp4_00-01.png ADDED
assets/72109_125.mp4_00-00.png ADDED
assets/72109_125.mp4_00-01.png ADDED
assets/72110_255.mp4_00-00.png ADDED
assets/72110_255.mp4_00-01.png ADDED
assets/74302_1349_frame1.png ADDED
assets/74302_1349_frame3.png ADDED
assets/Japan_v2_1_070321_s3_frame1.png ADDED
assets/Japan_v2_1_070321_s3_frame3.png ADDED
assets/Japan_v2_2_062266_s2_frame1.png ADDED
assets/Japan_v2_2_062266_s2_frame3.png ADDED
assets/frame0001_05.png ADDED
assets/frame0001_09.png ADDED
assets/frame0001_10.png ADDED
assets/frame0001_11.png ADDED
assets/frame0016_10.png ADDED
assets/frame0016_11.png ADDED
checkpoints/tooncrafter_512_interp_v1/model go here.txt ADDED
File without changes
configs/inference_512_v1.0.yaml ADDED
@@ -0,0 +1,103 @@
1
+ model:
2
+ target: lvdm.models.ddpm3d.LatentVisualDiffusion
3
+ params:
4
+ rescale_betas_zero_snr: True
5
+ parameterization: "v"
6
+ linear_start: 0.00085
7
+ linear_end: 0.012
8
+ num_timesteps_cond: 1
9
+ timesteps: 1000
10
+ first_stage_key: video
11
+ cond_stage_key: caption
12
+ cond_stage_trainable: False
13
+ conditioning_key: hybrid
14
+ image_size: [40, 64]
15
+ channels: 4
16
+ scale_by_std: False
17
+ scale_factor: 0.18215
18
+ use_ema: False
19
+ uncond_type: 'empty_seq'
20
+ use_dynamic_rescale: true
21
+ base_scale: 0.7
22
+ fps_condition_type: 'fps'
23
+ perframe_ae: True
24
+ loop_video: true
25
+ unet_config:
26
+ target: lvdm.modules.networks.openaimodel3d.UNetModel
27
+ params:
28
+ in_channels: 8
29
+ out_channels: 4
30
+ model_channels: 320
31
+ attention_resolutions:
32
+ - 4
33
+ - 2
34
+ - 1
35
+ num_res_blocks: 2
36
+ channel_mult:
37
+ - 1
38
+ - 2
39
+ - 4
40
+ - 4
41
+ dropout: 0.1
42
+ num_head_channels: 64
43
+ transformer_depth: 1
44
+ context_dim: 1024
45
+ use_linear: true
46
+ use_checkpoint: True
47
+ temporal_conv: True
48
+ temporal_attention: True
49
+ temporal_selfatt_only: true
50
+ use_relative_position: false
51
+ use_causal_attention: False
52
+ temporal_length: 16
53
+ addition_attention: true
54
+ image_cross_attention: true
55
+ default_fs: 24
56
+ fs_condition: true
57
+
58
+ first_stage_config:
59
+ target: lvdm.models.autoencoder.AutoencoderKL_Dualref
60
+ params:
61
+ embed_dim: 4
62
+ monitor: val/rec_loss
63
+ ddconfig:
64
+ double_z: True
65
+ z_channels: 4
66
+ resolution: 256
67
+ in_channels: 3
68
+ out_ch: 3
69
+ ch: 128
70
+ ch_mult:
71
+ - 1
72
+ - 2
73
+ - 4
74
+ - 4
75
+ num_res_blocks: 2
76
+ attn_resolutions: []
77
+ dropout: 0.0
78
+ lossconfig:
79
+ target: torch.nn.Identity
80
+
81
+ cond_stage_config:
82
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
83
+ params:
84
+ freeze: true
85
+ layer: "penultimate"
86
+
87
+ img_cond_stage_config:
88
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
89
+ params:
90
+ freeze: true
91
+
92
+ image_proj_stage_config:
93
+ target: lvdm.modules.encoders.resampler.Resampler
94
+ params:
95
+ dim: 1024
96
+ depth: 4
97
+ dim_head: 64
98
+ heads: 12
99
+ num_queries: 16
100
+ embedding_dim: 1280
101
+ output_dim: 1024
102
+ ff_mult: 4
103
+ video_length: 16
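Each block in the config above follows the `target:`/`params:` convention, where `target` is a dotted class path and `params` holds its constructor arguments. A generic sketch of how such a block is resolved (the repo ships its own helper for this; the version below is only an illustration and assumes `omegaconf` is installed):

```python
# Illustration of the target/params pattern used throughout the YAML configs.
import importlib
from omegaconf import OmegaConf

def instantiate(cfg):
    # "pkg.module.Class" -> import pkg.module, then Class(**params)
    module, cls = cfg["target"].rsplit(".", 1)
    return getattr(importlib.import_module(module), cls)(**cfg.get("params", {}))

config = OmegaConf.load("configs/inference_512_v1.0.yaml")
model = instantiate(config.model)  # lvdm.models.ddpm3d.LatentVisualDiffusion
```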
configs/training_1024_v1.0/config.yaml ADDED
@@ -0,0 +1,166 @@
1
+ model:
2
+ pretrained_checkpoint: checkpoints/dynamicrafter_1024_v1/model.ckpt
3
+ base_learning_rate: 1.0e-05
4
+ scale_lr: False
5
+ target: lvdm.models.ddpm3d.LatentVisualDiffusion
6
+ params:
7
+ rescale_betas_zero_snr: True
8
+ parameterization: "v"
9
+ linear_start: 0.00085
10
+ linear_end: 0.012
11
+ num_timesteps_cond: 1
12
+ log_every_t: 200
13
+ timesteps: 1000
14
+ first_stage_key: video
15
+ cond_stage_key: caption
16
+ cond_stage_trainable: False
17
+ image_proj_model_trainable: True
18
+ conditioning_key: hybrid
19
+ image_size: [72, 128]
20
+ channels: 4
21
+ scale_by_std: False
22
+ scale_factor: 0.18215
23
+ use_ema: False
24
+ uncond_prob: 0.05
25
+ uncond_type: 'empty_seq'
26
+ rand_cond_frame: true
27
+ use_dynamic_rescale: true
28
+ base_scale: 0.3
29
+ fps_condition_type: 'fps'
30
+ perframe_ae: True
31
+
32
+ unet_config:
33
+ target: lvdm.modules.networks.openaimodel3d.UNetModel
34
+ params:
35
+ in_channels: 8
36
+ out_channels: 4
37
+ model_channels: 320
38
+ attention_resolutions:
39
+ - 4
40
+ - 2
41
+ - 1
42
+ num_res_blocks: 2
43
+ channel_mult:
44
+ - 1
45
+ - 2
46
+ - 4
47
+ - 4
48
+ dropout: 0.1
49
+ num_head_channels: 64
50
+ transformer_depth: 1
51
+ context_dim: 1024
52
+ use_linear: true
53
+ use_checkpoint: True
54
+ temporal_conv: True
55
+ temporal_attention: True
56
+ temporal_selfatt_only: true
57
+ use_relative_position: false
58
+ use_causal_attention: False
59
+ temporal_length: 16
60
+ addition_attention: true
61
+ image_cross_attention: true
62
+ default_fs: 10
63
+ fs_condition: true
64
+
65
+ first_stage_config:
66
+ target: lvdm.models.autoencoder.AutoencoderKL
67
+ params:
68
+ embed_dim: 4
69
+ monitor: val/rec_loss
70
+ ddconfig:
71
+ double_z: True
72
+ z_channels: 4
73
+ resolution: 256
74
+ in_channels: 3
75
+ out_ch: 3
76
+ ch: 128
77
+ ch_mult:
78
+ - 1
79
+ - 2
80
+ - 4
81
+ - 4
82
+ num_res_blocks: 2
83
+ attn_resolutions: []
84
+ dropout: 0.0
85
+ lossconfig:
86
+ target: torch.nn.Identity
87
+
88
+ cond_stage_config:
89
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
90
+ params:
91
+ freeze: true
92
+ layer: "penultimate"
93
+
94
+ img_cond_stage_config:
95
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
96
+ params:
97
+ freeze: true
98
+
99
+ image_proj_stage_config:
100
+ target: lvdm.modules.encoders.resampler.Resampler
101
+ params:
102
+ dim: 1024
103
+ depth: 4
104
+ dim_head: 64
105
+ heads: 12
106
+ num_queries: 16
107
+ embedding_dim: 1280
108
+ output_dim: 1024
109
+ ff_mult: 4
110
+ video_length: 16
111
+
112
+ data:
113
+ target: utils_data.DataModuleFromConfig
114
+ params:
115
+ batch_size: 1
116
+ num_workers: 12
117
+ wrap: false
118
+ train:
119
+ target: lvdm.data.webvid.WebVid
120
+ params:
121
+ data_dir: <WebVid10M DATA>
122
+ meta_path: <.csv FILE>
123
+ video_length: 16
124
+ frame_stride: 6
125
+ load_raw_resolution: true
126
+ resolution: [576, 1024]
127
+ spatial_transform: resize_center_crop
128
+ random_fs: true ## if true, we uniformly sample fs with max_fs=frame_stride (above)
129
+
130
+ lightning:
131
+ precision: 16
132
+ # strategy: deepspeed_stage_2
133
+ trainer:
134
+ benchmark: True
135
+ accumulate_grad_batches: 2
136
+ max_steps: 100000
137
+ # logger
138
+ log_every_n_steps: 50
139
+ # val
140
+ val_check_interval: 0.5
141
+ gradient_clip_algorithm: 'norm'
142
+ gradient_clip_val: 0.5
143
+ callbacks:
144
+ model_checkpoint:
145
+ target: pytorch_lightning.callbacks.ModelCheckpoint
146
+ params:
147
+ every_n_train_steps: 9000 #1000
148
+ filename: "{epoch}-{step}"
149
+ save_weights_only: True
150
+ metrics_over_trainsteps_checkpoint:
151
+ target: pytorch_lightning.callbacks.ModelCheckpoint
152
+ params:
153
+ filename: '{epoch}-{step}'
154
+ save_weights_only: True
155
+ every_n_train_steps: 10000 #20000 # 3s/step*2w=
156
+ batch_logger:
157
+ target: callbacks.ImageLogger
158
+ params:
159
+ batch_frequency: 500
160
+ to_local: False
161
+ max_images: 8
162
+ log_images_kwargs:
163
+ ddim_steps: 50
164
+ unconditional_guidance_scale: 7.5
165
+ timestep_spacing: uniform_trailing
166
+ guidance_rescale: 0.7
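For reference, with the settings above (per-GPU `batch_size: 1`, `accumulate_grad_batches: 2`) and the 8-GPU launch in the accompanying `run.sh`, the effective batch size works out to 1 × 8 × 2 = 16 samples per optimizer step; the 512 config further below uses `batch_size: 2` for an effective 32.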
configs/training_1024_v1.0/run.sh ADDED
@@ -0,0 +1,37 @@
1
+ # NCCL configuration
2
+ # export NCCL_DEBUG=INFO
3
+ # export NCCL_IB_DISABLE=0
4
+ # export NCCL_IB_GID_INDEX=3
5
+ # export NCCL_NET_GDR_LEVEL=3
6
+ # export NCCL_TOPO_FILE=/tmp/topo.txt
7
+
8
+ # args
9
+ name="training_1024_v1.0"
10
+ config_file=configs/${name}/config.yaml
11
+
12
+ # save root dir for logs, checkpoints, tensorboard record, etc.
13
+ save_root="<YOUR_SAVE_ROOT_DIR>"
14
+
15
+ mkdir -p $save_root/$name
16
+
17
+ ## run
18
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch \
19
+ --nproc_per_node=$HOST_GPU_NUM --nnodes=1 --master_addr=127.0.0.1 --master_port=12352 --node_rank=0 \
20
+ ./main/trainer.py \
21
+ --base $config_file \
22
+ --train \
23
+ --name $name \
24
+ --logdir $save_root \
25
+ --devices $HOST_GPU_NUM \
26
+ lightning.trainer.num_nodes=1
27
+
28
+ ## debugging
29
+ # CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch \
30
+ # --nproc_per_node=4 --nnodes=1 --master_addr=127.0.0.1 --master_port=12352 --node_rank=0 \
31
+ # ./main/trainer.py \
32
+ # --base $config_file \
33
+ # --train \
34
+ # --name $name \
35
+ # --logdir $save_root \
36
+ # --devices 4 \
37
+ # lightning.trainer.num_nodes=1
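Note that the launch command above reads `$HOST_GPU_NUM` but the script never sets it, so it presumably has to be exported beforehand; a usage sketch (the value 8 matches the `CUDA_VISIBLE_DEVICES` list, and the `save_root` placeholder in the script must be filled in first):

```bash
export HOST_GPU_NUM=8
sh configs/training_1024_v1.0/run.sh
```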
configs/training_512_v1.0/config.yaml ADDED
@@ -0,0 +1,166 @@
1
+ model:
2
+ pretrained_checkpoint: checkpoints/dynamicrafter_512_v1/model.ckpt
3
+ base_learning_rate: 1.0e-05
4
+ scale_lr: False
5
+ target: lvdm.models.ddpm3d.LatentVisualDiffusion
6
+ params:
7
+ rescale_betas_zero_snr: True
8
+ parameterization: "v"
9
+ linear_start: 0.00085
10
+ linear_end: 0.012
11
+ num_timesteps_cond: 1
12
+ log_every_t: 200
13
+ timesteps: 1000
14
+ first_stage_key: video
15
+ cond_stage_key: caption
16
+ cond_stage_trainable: False
17
+ image_proj_model_trainable: True
18
+ conditioning_key: hybrid
19
+ image_size: [40, 64]
20
+ channels: 4
21
+ scale_by_std: False
22
+ scale_factor: 0.18215
23
+ use_ema: False
24
+ uncond_prob: 0.05
25
+ uncond_type: 'empty_seq'
26
+ rand_cond_frame: true
27
+ use_dynamic_rescale: true
28
+ base_scale: 0.7
29
+ fps_condition_type: 'fps'
30
+ perframe_ae: True
31
+
32
+ unet_config:
33
+ target: lvdm.modules.networks.openaimodel3d.UNetModel
34
+ params:
35
+ in_channels: 8
36
+ out_channels: 4
37
+ model_channels: 320
38
+ attention_resolutions:
39
+ - 4
40
+ - 2
41
+ - 1
42
+ num_res_blocks: 2
43
+ channel_mult:
44
+ - 1
45
+ - 2
46
+ - 4
47
+ - 4
48
+ dropout: 0.1
49
+ num_head_channels: 64
50
+ transformer_depth: 1
51
+ context_dim: 1024
52
+ use_linear: true
53
+ use_checkpoint: True
54
+ temporal_conv: True
55
+ temporal_attention: True
56
+ temporal_selfatt_only: true
57
+ use_relative_position: false
58
+ use_causal_attention: False
59
+ temporal_length: 16
60
+ addition_attention: true
61
+ image_cross_attention: true
62
+ default_fs: 10
63
+ fs_condition: true
64
+
65
+ first_stage_config:
66
+ target: lvdm.models.autoencoder.AutoencoderKL
67
+ params:
68
+ embed_dim: 4
69
+ monitor: val/rec_loss
70
+ ddconfig:
71
+ double_z: True
72
+ z_channels: 4
73
+ resolution: 256
74
+ in_channels: 3
75
+ out_ch: 3
76
+ ch: 128
77
+ ch_mult:
78
+ - 1
79
+ - 2
80
+ - 4
81
+ - 4
82
+ num_res_blocks: 2
83
+ attn_resolutions: []
84
+ dropout: 0.0
85
+ lossconfig:
86
+ target: torch.nn.Identity
87
+
88
+ cond_stage_config:
89
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
90
+ params:
91
+ freeze: true
92
+ layer: "penultimate"
93
+
94
+ img_cond_stage_config:
95
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
96
+ params:
97
+ freeze: true
98
+
99
+ image_proj_stage_config:
100
+ target: lvdm.modules.encoders.resampler.Resampler
101
+ params:
102
+ dim: 1024
103
+ depth: 4
104
+ dim_head: 64
105
+ heads: 12
106
+ num_queries: 16
107
+ embedding_dim: 1280
108
+ output_dim: 1024
109
+ ff_mult: 4
110
+ video_length: 16
111
+
112
+ data:
113
+ target: utils_data.DataModuleFromConfig
114
+ params:
115
+ batch_size: 2
116
+ num_workers: 12
117
+ wrap: false
118
+ train:
119
+ target: lvdm.data.webvid.WebVid
120
+ params:
121
+ data_dir: <WebVid10M DATA>
122
+ meta_path: <.csv FILE>
123
+ video_length: 16
124
+ frame_stride: 6
125
+ load_raw_resolution: true
126
+ resolution: [320, 512]
127
+ spatial_transform: resize_center_crop
128
+ random_fs: true ## if true, we uniformly sample fs with max_fs=frame_stride (above)
129
+
130
+ lightning:
131
+ precision: 16
132
+ # strategy: deepspeed_stage_2
133
+ trainer:
134
+ benchmark: True
135
+ accumulate_grad_batches: 2
136
+ max_steps: 100000
137
+ # logger
138
+ log_every_n_steps: 50
139
+ # val
140
+ val_check_interval: 0.5
141
+ gradient_clip_algorithm: 'norm'
142
+ gradient_clip_val: 0.5
143
+ callbacks:
144
+ model_checkpoint:
145
+ target: pytorch_lightning.callbacks.ModelCheckpoint
146
+ params:
147
+ every_n_train_steps: 9000 #1000
148
+ filename: "{epoch}-{step}"
149
+ save_weights_only: True
150
+ metrics_over_trainsteps_checkpoint:
151
+ target: pytorch_lightning.callbacks.ModelCheckpoint
152
+ params:
153
+ filename: '{epoch}-{step}'
154
+ save_weights_only: True
155
+ every_n_train_steps: 10000 #20000 # 3s/step*2w=
156
+ batch_logger:
157
+ target: callbacks.ImageLogger
158
+ params:
159
+ batch_frequency: 500
160
+ to_local: False
161
+ max_images: 8
162
+ log_images_kwargs:
163
+ ddim_steps: 50
164
+ unconditional_guidance_scale: 7.5
165
+ timestep_spacing: uniform_trailing
166
+ guidance_rescale: 0.7
configs/training_512_v1.0/run.sh ADDED
@@ -0,0 +1,37 @@
1
+ # NCCL configuration
2
+ # export NCCL_DEBUG=INFO
3
+ # export NCCL_IB_DISABLE=0
4
+ # export NCCL_IB_GID_INDEX=3
5
+ # export NCCL_NET_GDR_LEVEL=3
6
+ # export NCCL_TOPO_FILE=/tmp/topo.txt
7
+
8
+ # args
9
+ name="training_512_v1.0"
10
+ config_file=configs/${name}/config.yaml
11
+
12
+ # save root dir for logs, checkpoints, tensorboard record, etc.
13
+ save_root="<YOUR_SAVE_ROOT_DIR>"
14
+
15
+ mkdir -p $save_root/$name
16
+
17
+ ## run
18
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch \
19
+ --nproc_per_node=$HOST_GPU_NUM --nnodes=1 --master_addr=127.0.0.1 --master_port=12352 --node_rank=0 \
20
+ ./main/trainer.py \
21
+ --base $config_file \
22
+ --train \
23
+ --name $name \
24
+ --logdir $save_root \
25
+ --devices $HOST_GPU_NUM \
26
+ lightning.trainer.num_nodes=1
27
+
28
+ ## debugging
29
+ # CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch \
30
+ # --nproc_per_node=4 --nnodes=1 --master_addr=127.0.0.1 --master_port=12352 --node_rank=0 \
31
+ # ./main/trainer.py \
32
+ # --base $config_file \
33
+ # --train \
34
+ # --name $name \
35
+ # --logdir $save_root \
36
+ # --devices 4 \
37
+ # lightning.trainer.num_nodes=1
genshin_impact_img/AYATO_source.webp ADDED
genshin_impact_img/AYATO_target.webp ADDED
genshin_impact_img/ZHONGLI_source.webp ADDED
genshin_impact_img/ZHONGLI_target.webp ADDED
genshin_impact_img/ayato_smiling.mp4 ADDED
Binary file (610 kB).
 
genshin_impact_img/zhongli_sitting_down.mp4 ADDED
Binary file (621 kB).
 
gradio_app.py ADDED
@@ -0,0 +1,82 @@
1
+ import os, argparse
2
+ import sys
3
+ import gradio as gr
4
+ from scripts.gradio.i2v_test_application import Image2Video
5
+ sys.path.insert(1, os.path.join(sys.path[0], 'lvdm'))
6
+
7
+
8
+ i2v_examples_interp_512 = [
9
+ ['prompts/512_interp/74906_1462_frame1.png', 'walking man', 50, 7.5, 1.0, 10, 123, 'prompts/512_interp/74906_1462_frame3.png'],
10
+ ['prompts/512_interp/Japan_v2_2_062266_s2_frame1.png', 'an anime scene', 50, 7.5, 1.0, 10, 789, 'prompts/512_interp/Japan_v2_2_062266_s2_frame3.png'],
11
+ ['prompts/512_interp/Japan_v2_3_119235_s2_frame1.png', 'an anime scene', 50, 7.5, 1.0, 10, 123, 'prompts/512_interp/Japan_v2_3_119235_s2_frame3.png'],
12
+ ]
13
+
14
+
15
+
16
+
17
+ def dynamicrafter_demo(result_dir='./tmp/', res=512):
18
+ if res == 1024:
19
+ resolution = '576_1024'
20
+ css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height:576px}"""
21
+ elif res == 512:
22
+ resolution = '320_512'
23
+ css = """#input_img {max-width: 512px !important} #input_img2 {max-width: 512px !important} #output_vid {max-width: 512px; max-height: 320px}"""
24
+ elif res == 256:
25
+ resolution = '256_256'
26
+ css = """#input_img {max-width: 256px !important} #output_vid {max-width: 256px; max-height: 256px}"""
27
+ else:
28
+ raise NotImplementedError(f"Unsupported resolution: {res}")
29
+ image2video = Image2Video(result_dir, resolution=resolution)
30
+ with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
31
+
32
+
33
+
34
+ with gr.Tab(label='ToonCrafter_320x512'):
35
+ with gr.Column():
36
+ with gr.Row():
37
+ with gr.Column():
38
+ with gr.Row():
39
+ i2v_input_image = gr.Image(label="Input Image1", elem_id="input_img")
40
+ with gr.Row():
41
+ i2v_input_text = gr.Text(label='Prompts')
42
+ with gr.Row():
43
+ i2v_seed = gr.Slider(label='Random Seed', minimum=0, maximum=50000, step=1, value=123)
44
+ i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="i2v_eta")
45
+ i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.5, elem_id="i2v_cfg_scale")
46
+ with gr.Row():
47
+ i2v_steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
48
+ i2v_motion = gr.Slider(minimum=5, maximum=30, step=1, elem_id="i2v_motion", label="FPS", value=10)
49
+ i2v_end_btn = gr.Button("Generate")
50
+ with gr.Column():
51
+ with gr.Row():
52
+ i2v_input_image2 = gr.Image(label="Input Image2", elem_id="input_img2")
53
+ with gr.Row():
54
+ i2v_output_video = gr.Video(label="Generated Video", elem_id="output_vid", autoplay=True, show_share_button=True)
55
+
56
+ gr.Examples(examples=i2v_examples_interp_512,
57
+ inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_input_image2],
58
+ outputs=[i2v_output_video],
59
+ fn=image2video.get_image,
60
+ cache_examples=False,
61
+ )
62
+ i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_input_image2],
63
+ outputs=[i2v_output_video],
64
+ fn=image2video.get_image
65
+ )
66
+
67
+
68
+ return dynamicrafter_iface
69
+
70
+ def get_parser():
71
+ parser = argparse.ArgumentParser()
72
+ return parser
73
+
74
+ if __name__ == "__main__":
75
+ parser = get_parser()
76
+ args = parser.parse_args()
77
+
78
+ result_dir = os.path.join('./', 'results')
79
+ dynamicrafter_iface = dynamicrafter_demo(result_dir)
80
+ dynamicrafter_iface.queue(max_size=12)
81
+ dynamicrafter_iface.launch(max_threads=1, share=True)
82
+ #dynamicrafter_iface.launch(server_name='0.0.0.0', server_port=8080, max_threads=1)
lvdm/__pycache__/basics.cpython-310.pyc ADDED
Binary file (3.12 kB).
 
lvdm/__pycache__/common.cpython-310.pyc ADDED
Binary file (4.52 kB).