svjack committed · verified
Commit a7f3565 · 1 Parent(s): 0df1521

Upload 106 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +1 -0
  2. README_pre.md +257 -0
  3. app.py +79 -0
  4. assets/00.gif +0 -0
  5. assets/01.gif +0 -0
  6. assets/02.gif +0 -0
  7. assets/03.gif +0 -0
  8. assets/04.gif +0 -0
  9. assets/05.gif +0 -0
  10. assets/06.gif +0 -0
  11. assets/07.gif +0 -0
  12. assets/08.gif +0 -0
  13. assets/09.gif +0 -0
  14. assets/10.gif +0 -0
  15. assets/11.gif +0 -0
  16. assets/12.gif +0 -0
  17. assets/13.gif +3 -0
  18. assets/72105_388.mp4_00-00.png +0 -0
  19. assets/72105_388.mp4_00-01.png +0 -0
  20. assets/72109_125.mp4_00-00.png +0 -0
  21. assets/72109_125.mp4_00-01.png +0 -0
  22. assets/72110_255.mp4_00-00.png +0 -0
  23. assets/72110_255.mp4_00-01.png +0 -0
  24. assets/74302_1349_frame1.png +0 -0
  25. assets/74302_1349_frame3.png +0 -0
  26. assets/Japan_v2_1_070321_s3_frame1.png +0 -0
  27. assets/Japan_v2_1_070321_s3_frame3.png +0 -0
  28. assets/Japan_v2_2_062266_s2_frame1.png +0 -0
  29. assets/Japan_v2_2_062266_s2_frame3.png +0 -0
  30. assets/frame0001_05.png +0 -0
  31. assets/frame0001_09.png +0 -0
  32. assets/frame0001_10.png +0 -0
  33. assets/frame0001_11.png +0 -0
  34. assets/frame0016_10.png +0 -0
  35. assets/frame0016_11.png +0 -0
  36. checkpoints/tooncrafter_512_interp_v1/model go here.txt +0 -0
  37. configs/inference_512_v1.0.yaml +103 -0
  38. configs/training_1024_v1.0/config.yaml +166 -0
  39. configs/training_1024_v1.0/run.sh +37 -0
  40. configs/training_512_v1.0/config.yaml +166 -0
  41. configs/training_512_v1.0/run.sh +37 -0
  42. genshin_impact_img/AYATO_source.webp +0 -0
  43. genshin_impact_img/AYATO_target.webp +0 -0
  44. genshin_impact_img/ZHONGLI_source.webp +0 -0
  45. genshin_impact_img/ZHONGLI_target.webp +0 -0
  46. genshin_impact_img/ayato_smiling.mp4 +0 -0
  47. genshin_impact_img/zhongli_sitting_down.mp4 +0 -0
  48. gradio_app.py +82 -0
  49. lvdm/__pycache__/basics.cpython-310.pyc +0 -0
  50. lvdm/__pycache__/common.cpython-310.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/13.gif filter=lfs diff=lfs merge=lfs -text
README_pre.md ADDED
@@ -0,0 +1,257 @@
1
+ ## ___***ToonCrafter: Generative Cartoon Interpolation***___
2
+ <!-- ![](./assets/logo_long.png#gh-light-mode-only){: width="50%"} -->
3
+ <!-- ![](./assets/logo_long_dark.png#gh-dark-mode-only=100x20) -->
4
+ <div align="center">
5
+
6
+
7
+
8
+ </div>
9
+
10
+ ## 🔆 Introduction
11
+
12
+ ⚠️ Please check our [disclaimer](#disc) first.
13
+
14
+ 🤗 ToonCrafter can interpolate between two cartoon images by leveraging pre-trained image-to-video diffusion priors. Please check our project page and paper for more information. <br>
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+ ### 1.1 Showcases (512x320)
23
+ <table class="center">
24
+ <tr style="font-weight: bolder;text-align:center;">
25
+ <td>Input starting frame</td>
26
+ <td>Input ending frame</td>
27
+ <td>Generated video</td>
28
+ </tr>
29
+ <tr>
30
+ <td>
31
+ <img src=assets/72109_125.mp4_00-00.png width="250">
32
+ </td>
33
+ <td>
34
+ <img src=assets/72109_125.mp4_00-01.png width="250">
35
+ </td>
36
+ <td>
37
+ <img src=assets/00.gif width="250">
38
+ </td>
39
+ </tr>
40
+
41
+
42
+ <tr>
43
+ <td>
44
+ <img src=assets/Japan_v2_2_062266_s2_frame1.png width="250">
45
+ </td>
46
+ <td>
47
+ <img src=assets/Japan_v2_2_062266_s2_frame3.png width="250">
48
+ </td>
49
+ <td>
50
+ <img src=assets/03.gif width="250">
51
+ </td>
52
+ </tr>
53
+ <tr>
54
+ <td>
55
+ <img src=assets/Japan_v2_1_070321_s3_frame1.png width="250">
56
+ </td>
57
+ <td>
58
+ <img src=assets/Japan_v2_1_070321_s3_frame3.png width="250">
59
+ </td>
60
+ <td>
61
+ <img src=assets/02.gif width="250">
62
+ </td>
63
+ </tr>
64
+ <tr>
65
+ <td>
66
+ <img src=assets/74302_1349_frame1.png width="250">
67
+ </td>
68
+ <td>
69
+ <img src=assets/74302_1349_frame3.png width="250">
70
+ </td>
71
+ <td>
72
+ <img src=assets/01.gif width="250">
73
+ </td>
74
+ </tr>
75
+ </table>
76
+
77
+ ### 1.2 Sparse sketch guidance
78
+ <table class="center">
79
+ <tr style="font-weight: bolder;text-align:center;">
80
+ <td>Input starting frame</td>
81
+ <td>Input ending frame</td>
82
+ <td>Input sketch guidance</td>
83
+ <td>Generated video</td>
84
+ </tr>
85
+ <tr>
86
+ <td>
87
+ <img src=assets/72105_388.mp4_00-00.png width="200">
88
+ </td>
89
+ <td>
90
+ <img src=assets/72105_388.mp4_00-01.png width="200">
91
+ </td>
92
+ <td>
93
+ <img src=assets/06.gif width="200">
94
+ </td>
95
+ <td>
96
+ <img src=assets/07.gif width="200">
97
+ </td>
98
+ </tr>
99
+
100
+ <tr>
101
+ <td>
102
+ <img src=assets/72110_255.mp4_00-00.png width="200">
103
+ </td>
104
+ <td>
105
+ <img src=assets/72110_255.mp4_00-01.png width="200">
106
+ </td>
107
+ <td>
108
+ <img src=assets/12.gif width="200">
109
+ </td>
110
+ <td>
111
+ <img src=assets/13.gif width="200">
112
+ </td>
113
+ </tr>
114
+
115
+
116
+ </table>
117
+
118
+
119
+ ### 2. Applications
120
+ #### 2.1 Cartoon Sketch Interpolation (see project page for more details)
121
+ <table class="center">
122
+ <tr style="font-weight: bolder;text-align:center;">
123
+ <td>Input starting frame</td>
124
+ <td>Input ending frame</td>
125
+ <td>Generated video</td>
126
+ </tr>
127
+
128
+ <tr>
129
+ <td>
130
+ <img src=assets/frame0001_10.png width="250">
131
+ </td>
132
+ <td>
133
+ <img src=assets/frame0016_10.png width="250">
134
+ </td>
135
+ <td>
136
+ <img src=assets/10.gif width="250">
137
+ </td>
138
+ </tr>
139
+
140
+
141
+ <tr>
142
+ <td>
143
+ <img src=assets/frame0001_11.png width="250">
144
+ </td>
145
+ <td>
146
+ <img src=assets/frame0016_11.png width="250">
147
+ </td>
148
+ <td>
149
+ <img src=assets/11.gif width="250">
150
+ </td>
151
+ </tr>
152
+
153
+ </table>
154
+
155
+
156
+ #### 2.2 Reference-based Sketch Colorization
157
+ <table class="center">
158
+ <tr style="font-weight: bolder;text-align:center;">
159
+ <td>Input sketch</td>
160
+ <td>Input reference</td>
161
+ <td>Colorization results</td>
162
+ </tr>
163
+
164
+ <tr>
165
+ <td>
166
+ <img src=assets/04.gif width="250">
167
+ </td>
168
+ <td>
169
+ <img src=assets/frame0001_05.png width="250">
170
+ </td>
171
+ <td>
172
+ <img src=assets/05.gif width="250">
173
+ </td>
174
+ </tr>
175
+
176
+
177
+ <tr>
178
+ <td>
179
+ <img src=assets/08.gif width="250">
180
+ </td>
181
+ <td>
182
+ <img src=assets/frame0001_09.png width="250">
183
+ </td>
184
+ <td>
185
+ <img src=assets/09.gif width="250">
186
+ </td>
187
+ </tr>
188
+
189
+ </table>
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+ ## 📝 Changelog
198
+ - [ ] Add sketch control and colorization functions.
199
+ - __[2024.05.29]__: 🔥🔥 Release code and model weights.
200
+ - __[2024.05.28]__: Launch the project page and update the arXiv preprint.
201
+ <br>
202
+
203
+
204
+ ## 🧰 Models
205
+
206
+ |Model|Resolution|GPU Mem. & Inference Time (A100, DDIM 50 steps)|Checkpoint|
207
+ |:---------|:---------|:--------|:--------|
208
+ |ToonCrafter_512|320x512| TBD (`perframe_ae=True`)|[Hugging Face](https://huggingface.co/Doubiiu/ToonCrafter/blob/main/model.ckpt)|
209
+
210
+
211
+ Currently, ToonCrafter supports generating videos of up to 16 frames at a resolution of 512x320. Inference time can be reduced by using fewer DDIM steps.
212
+
213
+
214
+
215
+ ## ⚙️ Setup
216
+
217
+ ### Install Environment via Anaconda (Recommended)
218
+ ```bash
219
+ conda create -n tooncrafter python=3.8.5
220
+ conda activate tooncrafter
221
+ pip install -r requirements.txt
222
+ ```
223
+
224
+
225
+ ## 💫 Inference
226
+ ### 1. Command line
227
+
228
+ Download the pretrained ToonCrafter_512 checkpoint and place `model.ckpt` at `checkpoints/tooncrafter_512_interp_v1/model.ckpt`.
229
+ ```bash
230
+ sh scripts/run.sh
231
+ ```
232
+
233
+
234
+ ### 2. Local Gradio demo
235
+
236
+ Download the pretrained model and put it in the corresponding directory as described above.
237
+ ```bash
238
+ python gradio_app.py
239
+ ```
240
+
241
+
242
+
243
+
244
+
245
+
246
+ <!-- ## 🤝 Community Support -->
247
+
248
+
249
+
250
+ <a name="disc"></a>
251
+ ## 📢 Disclaimer
252
+ Calm down. Our framework opens up the era of generative cartoon interpolation, but due to the variability of the generative video prior, the success rate is not guaranteed.
253
+
254
+ ⚠️ This is an open-source research exploration, not a commercial product, and it may not meet all your expectations.
255
+
256
+ This project strives to impact the domain of AI-driven video generation positively. Users are granted the freedom to create videos using this tool, but they are expected to comply with local laws and utilize it responsibly. The developers do not assume any responsibility for potential misuse by users.
257
+ ****
app.py ADDED
@@ -0,0 +1,79 @@
1
+ import os
2
+ import argparse
3
+ import sys
4
+ import gradio as gr
5
+ from scripts.gradio.i2v_test_application import Image2Video
6
+ sys.path.insert(1, os.path.join(sys.path[0], 'lvdm'))
7
+
8
+ i2v_examples_interp_512 = [
9
+ ['genshin_impact_img/ZHONGLI_source.webp', 'An anime character sitting down', 50, 3.5, 1.0, 30, 123, 'genshin_impact_img/ZHONGLI_target.webp'],
10
+ ['genshin_impact_img/AYATO_source.webp', 'an anime man smiling', 50, 3.5, 1.0, 30, 123, 'genshin_impact_img/AYATO_target.webp'],
11
+ ['prompts/512_interp/74906_1462_frame1.png', 'walking man', 50, 7.5, 1.0, 10, 123, 'prompts/512_interp/74906_1462_frame3.png'],
12
+ ['prompts/512_interp/Japan_v2_2_062266_s2_frame1.png', 'an anime scene', 50, 7.5, 1.0, 10, 789, 'prompts/512_interp/Japan_v2_2_062266_s2_frame3.png'],
13
+ ['prompts/512_interp/Japan_v2_3_119235_s2_frame1.png', 'an anime scene', 50, 7.5, 1.0, 10, 123, 'prompts/512_interp/Japan_v2_3_119235_s2_frame3.png'],
14
+ ]
15
+
16
+ def dynamicrafter_demo(result_dir='./tmp/', res=512):
17
+ if res == 1024:
18
+ resolution = '576_1024'
19
+ css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height:576px}"""
20
+ elif res == 512:
21
+ resolution = '320_512'
22
+ css = """#input_img {max-width: 512px !important} #input_img2 {max-width: 512px !important} #output_vid {max-width: 512px; max-height: 320px}"""
23
+ elif res == 256:
24
+ resolution = '256_256'
25
+ css = """#input_img {max-width: 256px !important} #output_vid {max-width: 256px; max-height: 256px}"""
26
+ else:
27
+ raise NotImplementedError(f"Unsupported resolution: {res}")
28
+ image2video = Image2Video(result_dir, resolution=resolution)
29
+ with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
30
+ dynamicrafter_iface.title = "Image to Video Converter (with Genshin Impact Demo)"  # set the demo title
31
+
32
+ with gr.Tab(label='ToonCrafter_320x512'):
33
+ with gr.Column():
34
+ with gr.Row():
35
+ with gr.Column():
36
+ with gr.Row():
37
+ i2v_input_image = gr.Image(label="Input Image1", elem_id="input_img")
38
+ with gr.Row():
39
+ i2v_input_text = gr.Text(label='Prompts')
40
+ with gr.Row():
41
+ i2v_seed = gr.Slider(label='Random Seed', minimum=0, maximum=50000, step=1, value=123)
42
+ i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="i2v_eta")
43
+ i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.5, elem_id="i2v_cfg_scale")
44
+ with gr.Row():
45
+ i2v_steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
46
+ i2v_motion = gr.Slider(minimum=5, maximum=30, step=1, elem_id="i2v_motion", label="FPS", value=10)
47
+ i2v_end_btn = gr.Button("Generate")
48
+ with gr.Column():
49
+ with gr.Row():
50
+ i2v_input_image2 = gr.Image(label="Input Image2", elem_id="input_img2")
51
+ with gr.Row():
52
+ i2v_output_video = gr.Video(label="Generated Video", elem_id="output_vid", autoplay=True, show_share_button=True)
53
+
54
+ gr.Examples(examples=i2v_examples_interp_512,
55
+ inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_input_image2],
56
+ outputs=[i2v_output_video],
57
+ fn=image2video.get_image,
58
+ cache_examples=False,
59
+ )
60
+ i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_input_image2],
61
+ outputs=[i2v_output_video],
62
+ fn=image2video.get_image
63
+ )
64
+
65
+ return dynamicrafter_iface
66
+
67
+ def get_parser():
68
+ parser = argparse.ArgumentParser()
69
+ return parser
70
+
71
+ if __name__ == "__main__":
72
+ parser = get_parser()
73
+ args = parser.parse_args()
74
+
75
+ result_dir = os.path.join('./', 'results')
76
+ dynamicrafter_iface = dynamicrafter_demo(result_dir)
77
+ dynamicrafter_iface.queue(max_size=12)
78
+ dynamicrafter_iface.launch(max_threads=1, share=True)
79
+ #dynamicrafter_iface.launch(server_name='0.0.0.0', server_port=8080, max_threads=1)
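`app.py` wires `Image2Video.get_image` as the click handler, so the example rows above also document the argument order. A sketch of invoking the same entry point programmatically; `Image2Video` lives in `scripts/gradio/i2v_test_application`, which is outside this truncated 50-file view, so the exact signature is assumed from the Gradio wiring rather than verified:

```python
# Sketch only: mirrors the gr.Examples / click() inputs in app.py above.
from scripts.gradio.i2v_test_application import Image2Video  # not shown in this view

i2v = Image2Video('./tmp/', resolution='320_512')
video = i2v.get_image(
    'genshin_impact_img/ZHONGLI_source.webp',  # starting frame
    'An anime character sitting down',         # prompt
    50,                                        # sampling steps
    3.5,                                       # CFG scale
    1.0,                                       # ETA
    30,                                        # FPS
    123,                                       # random seed
    'genshin_impact_img/ZHONGLI_target.webp',  # ending frame
)
print(video)  # presumably the path of the generated clip
```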
assets/00.gif ADDED
assets/01.gif ADDED
assets/02.gif ADDED
assets/03.gif ADDED
assets/04.gif ADDED
assets/05.gif ADDED
assets/06.gif ADDED
assets/07.gif ADDED
assets/08.gif ADDED
assets/09.gif ADDED
assets/10.gif ADDED
assets/11.gif ADDED
assets/12.gif ADDED
assets/13.gif ADDED

Git LFS Details

  • SHA256: 179af7d265d8790c0ca31a5898f870961b0a738b02d9fd0c991a3a75651cbb56
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
assets/72105_388.mp4_00-00.png ADDED
assets/72105_388.mp4_00-01.png ADDED
assets/72109_125.mp4_00-00.png ADDED
assets/72109_125.mp4_00-01.png ADDED
assets/72110_255.mp4_00-00.png ADDED
assets/72110_255.mp4_00-01.png ADDED
assets/74302_1349_frame1.png ADDED
assets/74302_1349_frame3.png ADDED
assets/Japan_v2_1_070321_s3_frame1.png ADDED
assets/Japan_v2_1_070321_s3_frame3.png ADDED
assets/Japan_v2_2_062266_s2_frame1.png ADDED
assets/Japan_v2_2_062266_s2_frame3.png ADDED
assets/frame0001_05.png ADDED
assets/frame0001_09.png ADDED
assets/frame0001_10.png ADDED
assets/frame0001_11.png ADDED
assets/frame0016_10.png ADDED
assets/frame0016_11.png ADDED
checkpoints/tooncrafter_512_interp_v1/model go here.txt ADDED
File without changes
configs/inference_512_v1.0.yaml ADDED
@@ -0,0 +1,103 @@
1
+ model:
2
+ target: lvdm.models.ddpm3d.LatentVisualDiffusion
3
+ params:
4
+ rescale_betas_zero_snr: True
5
+ parameterization: "v"
6
+ linear_start: 0.00085
7
+ linear_end: 0.012
8
+ num_timesteps_cond: 1
9
+ timesteps: 1000
10
+ first_stage_key: video
11
+ cond_stage_key: caption
12
+ cond_stage_trainable: False
13
+ conditioning_key: hybrid
14
+ image_size: [40, 64]
15
+ channels: 4
16
+ scale_by_std: False
17
+ scale_factor: 0.18215
18
+ use_ema: False
19
+ uncond_type: 'empty_seq'
20
+ use_dynamic_rescale: true
21
+ base_scale: 0.7
22
+ fps_condition_type: 'fps'
23
+ perframe_ae: True
24
+ loop_video: true
25
+ unet_config:
26
+ target: lvdm.modules.networks.openaimodel3d.UNetModel
27
+ params:
28
+ in_channels: 8
29
+ out_channels: 4
30
+ model_channels: 320
31
+ attention_resolutions:
32
+ - 4
33
+ - 2
34
+ - 1
35
+ num_res_blocks: 2
36
+ channel_mult:
37
+ - 1
38
+ - 2
39
+ - 4
40
+ - 4
41
+ dropout: 0.1
42
+ num_head_channels: 64
43
+ transformer_depth: 1
44
+ context_dim: 1024
45
+ use_linear: true
46
+ use_checkpoint: True
47
+ temporal_conv: True
48
+ temporal_attention: True
49
+ temporal_selfatt_only: true
50
+ use_relative_position: false
51
+ use_causal_attention: False
52
+ temporal_length: 16
53
+ addition_attention: true
54
+ image_cross_attention: true
55
+ default_fs: 24
56
+ fs_condition: true
57
+
58
+ first_stage_config:
59
+ target: lvdm.models.autoencoder.AutoencoderKL_Dualref
60
+ params:
61
+ embed_dim: 4
62
+ monitor: val/rec_loss
63
+ ddconfig:
64
+ double_z: True
65
+ z_channels: 4
66
+ resolution: 256
67
+ in_channels: 3
68
+ out_ch: 3
69
+ ch: 128
70
+ ch_mult:
71
+ - 1
72
+ - 2
73
+ - 4
74
+ - 4
75
+ num_res_blocks: 2
76
+ attn_resolutions: []
77
+ dropout: 0.0
78
+ lossconfig:
79
+ target: torch.nn.Identity
80
+
81
+ cond_stage_config:
82
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
83
+ params:
84
+ freeze: true
85
+ layer: "penultimate"
86
+
87
+ img_cond_stage_config:
88
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
89
+ params:
90
+ freeze: true
91
+
92
+ image_proj_stage_config:
93
+ target: lvdm.modules.encoders.resampler.Resampler
94
+ params:
95
+ dim: 1024
96
+ depth: 4
97
+ dim_head: 64
98
+ heads: 12
99
+ num_queries: 16
100
+ embedding_dim: 1280
101
+ output_dim: 1024
102
+ ff_mult: 4
103
+ video_length: 16
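Each block in the config above follows the `target:`/`params:` convention, where `target` is a dotted class path and `params` holds its constructor arguments. A generic sketch of how such a block is resolved (the repo ships its own helper for this; the version below is only an illustration and assumes `omegaconf` is installed):

```python
# Illustration of the target/params pattern used throughout the YAML configs.
import importlib
from omegaconf import OmegaConf

def instantiate(cfg):
    # "pkg.module.Class" -> import pkg.module, then Class(**params)
    module, cls = cfg["target"].rsplit(".", 1)
    return getattr(importlib.import_module(module), cls)(**cfg.get("params", {}))

config = OmegaConf.load("configs/inference_512_v1.0.yaml")
model = instantiate(config.model)  # lvdm.models.ddpm3d.LatentVisualDiffusion
```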
configs/training_1024_v1.0/config.yaml ADDED
@@ -0,0 +1,166 @@
1
+ model:
2
+ pretrained_checkpoint: checkpoints/dynamicrafter_1024_v1/model.ckpt
3
+ base_learning_rate: 1.0e-05
4
+ scale_lr: False
5
+ target: lvdm.models.ddpm3d.LatentVisualDiffusion
6
+ params:
7
+ rescale_betas_zero_snr: True
8
+ parameterization: "v"
9
+ linear_start: 0.00085
10
+ linear_end: 0.012
11
+ num_timesteps_cond: 1
12
+ log_every_t: 200
13
+ timesteps: 1000
14
+ first_stage_key: video
15
+ cond_stage_key: caption
16
+ cond_stage_trainable: False
17
+ image_proj_model_trainable: True
18
+ conditioning_key: hybrid
19
+ image_size: [72, 128]
20
+ channels: 4
21
+ scale_by_std: False
22
+ scale_factor: 0.18215
23
+ use_ema: False
24
+ uncond_prob: 0.05
25
+ uncond_type: 'empty_seq'
26
+ rand_cond_frame: true
27
+ use_dynamic_rescale: true
28
+ base_scale: 0.3
29
+ fps_condition_type: 'fps'
30
+ perframe_ae: True
31
+
32
+ unet_config:
33
+ target: lvdm.modules.networks.openaimodel3d.UNetModel
34
+ params:
35
+ in_channels: 8
36
+ out_channels: 4
37
+ model_channels: 320
38
+ attention_resolutions:
39
+ - 4
40
+ - 2
41
+ - 1
42
+ num_res_blocks: 2
43
+ channel_mult:
44
+ - 1
45
+ - 2
46
+ - 4
47
+ - 4
48
+ dropout: 0.1
49
+ num_head_channels: 64
50
+ transformer_depth: 1
51
+ context_dim: 1024
52
+ use_linear: true
53
+ use_checkpoint: True
54
+ temporal_conv: True
55
+ temporal_attention: True
56
+ temporal_selfatt_only: true
57
+ use_relative_position: false
58
+ use_causal_attention: False
59
+ temporal_length: 16
60
+ addition_attention: true
61
+ image_cross_attention: true
62
+ default_fs: 10
63
+ fs_condition: true
64
+
65
+ first_stage_config:
66
+ target: lvdm.models.autoencoder.AutoencoderKL
67
+ params:
68
+ embed_dim: 4
69
+ monitor: val/rec_loss
70
+ ddconfig:
71
+ double_z: True
72
+ z_channels: 4
73
+ resolution: 256
74
+ in_channels: 3
75
+ out_ch: 3
76
+ ch: 128
77
+ ch_mult:
78
+ - 1
79
+ - 2
80
+ - 4
81
+ - 4
82
+ num_res_blocks: 2
83
+ attn_resolutions: []
84
+ dropout: 0.0
85
+ lossconfig:
86
+ target: torch.nn.Identity
87
+
88
+ cond_stage_config:
89
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
90
+ params:
91
+ freeze: true
92
+ layer: "penultimate"
93
+
94
+ img_cond_stage_config:
95
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
96
+ params:
97
+ freeze: true
98
+
99
+ image_proj_stage_config:
100
+ target: lvdm.modules.encoders.resampler.Resampler
101
+ params:
102
+ dim: 1024
103
+ depth: 4
104
+ dim_head: 64
105
+ heads: 12
106
+ num_queries: 16
107
+ embedding_dim: 1280
108
+ output_dim: 1024
109
+ ff_mult: 4
110
+ video_length: 16
111
+
112
+ data:
113
+ target: utils_data.DataModuleFromConfig
114
+ params:
115
+ batch_size: 1
116
+ num_workers: 12
117
+ wrap: false
118
+ train:
119
+ target: lvdm.data.webvid.WebVid
120
+ params:
121
+ data_dir: <WebVid10M DATA>
122
+ meta_path: <.csv FILE>
123
+ video_length: 16
124
+ frame_stride: 6
125
+ load_raw_resolution: true
126
+ resolution: [576, 1024]
127
+ spatial_transform: resize_center_crop
128
+ random_fs: true ## if true, we uniformly sample fs with max_fs=frame_stride (above)
129
+
130
+ lightning:
131
+ precision: 16
132
+ # strategy: deepspeed_stage_2
133
+ trainer:
134
+ benchmark: True
135
+ accumulate_grad_batches: 2
136
+ max_steps: 100000
137
+ # logger
138
+ log_every_n_steps: 50
139
+ # val
140
+ val_check_interval: 0.5
141
+ gradient_clip_algorithm: 'norm'
142
+ gradient_clip_val: 0.5
143
+ callbacks:
144
+ model_checkpoint:
145
+ target: pytorch_lightning.callbacks.ModelCheckpoint
146
+ params:
147
+ every_n_train_steps: 9000 #1000
148
+ filename: "{epoch}-{step}"
149
+ save_weights_only: True
150
+ metrics_over_trainsteps_checkpoint:
151
+ target: pytorch_lightning.callbacks.ModelCheckpoint
152
+ params:
153
+ filename: '{epoch}-{step}'
154
+ save_weights_only: True
155
+ every_n_train_steps: 10000 #20000 # 3s/step*2w=
156
+ batch_logger:
157
+ target: callbacks.ImageLogger
158
+ params:
159
+ batch_frequency: 500
160
+ to_local: False
161
+ max_images: 8
162
+ log_images_kwargs:
163
+ ddim_steps: 50
164
+ unconditional_guidance_scale: 7.5
165
+ timestep_spacing: uniform_trailing
166
+ guidance_rescale: 0.7
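For reference, with the settings above (per-GPU `batch_size: 1`, `accumulate_grad_batches: 2`) and the 8-GPU launch in the accompanying `run.sh`, the effective batch size works out to 1 × 8 × 2 = 16 samples per optimizer step; the 512 config further below uses `batch_size: 2` for an effective 32.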
configs/training_1024_v1.0/run.sh ADDED
@@ -0,0 +1,37 @@
1
+ # NCCL configuration
2
+ # export NCCL_DEBUG=INFO
3
+ # export NCCL_IB_DISABLE=0
4
+ # export NCCL_IB_GID_INDEX=3
5
+ # export NCCL_NET_GDR_LEVEL=3
6
+ # export NCCL_TOPO_FILE=/tmp/topo.txt
7
+
8
+ # args
9
+ name="training_1024_v1.0"
10
+ config_file=configs/${name}/config.yaml
11
+
12
+ # save root dir for logs, checkpoints, tensorboard record, etc.
13
+ save_root="<YOUR_SAVE_ROOT_DIR>"
14
+
15
+ mkdir -p $save_root/$name
16
+
17
+ ## run
18
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch \
19
+ --nproc_per_node=$HOST_GPU_NUM --nnodes=1 --master_addr=127.0.0.1 --master_port=12352 --node_rank=0 \
20
+ ./main/trainer.py \
21
+ --base $config_file \
22
+ --train \
23
+ --name $name \
24
+ --logdir $save_root \
25
+ --devices $HOST_GPU_NUM \
26
+ lightning.trainer.num_nodes=1
27
+
28
+ ## debugging
29
+ # CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch \
30
+ # --nproc_per_node=4 --nnodes=1 --master_addr=127.0.0.1 --master_port=12352 --node_rank=0 \
31
+ # ./main/trainer.py \
32
+ # --base $config_file \
33
+ # --train \
34
+ # --name $name \
35
+ # --logdir $save_root \
36
+ # --devices 4 \
37
+ # lightning.trainer.num_nodes=1
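Note that the launch command above reads `$HOST_GPU_NUM` but the script never sets it, so it presumably has to be exported beforehand; a usage sketch (the value 8 matches the `CUDA_VISIBLE_DEVICES` list, and the `save_root` placeholder in the script must be filled in first):

```bash
export HOST_GPU_NUM=8
sh configs/training_1024_v1.0/run.sh
```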
configs/training_512_v1.0/config.yaml ADDED
@@ -0,0 +1,166 @@
1
+ model:
2
+ pretrained_checkpoint: checkpoints/dynamicrafter_512_v1/model.ckpt
3
+ base_learning_rate: 1.0e-05
4
+ scale_lr: False
5
+ target: lvdm.models.ddpm3d.LatentVisualDiffusion
6
+ params:
7
+ rescale_betas_zero_snr: True
8
+ parameterization: "v"
9
+ linear_start: 0.00085
10
+ linear_end: 0.012
11
+ num_timesteps_cond: 1
12
+ log_every_t: 200
13
+ timesteps: 1000
14
+ first_stage_key: video
15
+ cond_stage_key: caption
16
+ cond_stage_trainable: False
17
+ image_proj_model_trainable: True
18
+ conditioning_key: hybrid
19
+ image_size: [40, 64]
20
+ channels: 4
21
+ scale_by_std: False
22
+ scale_factor: 0.18215
23
+ use_ema: False
24
+ uncond_prob: 0.05
25
+ uncond_type: 'empty_seq'
26
+ rand_cond_frame: true
27
+ use_dynamic_rescale: true
28
+ base_scale: 0.7
29
+ fps_condition_type: 'fps'
30
+ perframe_ae: True
31
+
32
+ unet_config:
33
+ target: lvdm.modules.networks.openaimodel3d.UNetModel
34
+ params:
35
+ in_channels: 8
36
+ out_channels: 4
37
+ model_channels: 320
38
+ attention_resolutions:
39
+ - 4
40
+ - 2
41
+ - 1
42
+ num_res_blocks: 2
43
+ channel_mult:
44
+ - 1
45
+ - 2
46
+ - 4
47
+ - 4
48
+ dropout: 0.1
49
+ num_head_channels: 64
50
+ transformer_depth: 1
51
+ context_dim: 1024
52
+ use_linear: true
53
+ use_checkpoint: True
54
+ temporal_conv: True
55
+ temporal_attention: True
56
+ temporal_selfatt_only: true
57
+ use_relative_position: false
58
+ use_causal_attention: False
59
+ temporal_length: 16
60
+ addition_attention: true
61
+ image_cross_attention: true
62
+ default_fs: 10
63
+ fs_condition: true
64
+
65
+ first_stage_config:
66
+ target: lvdm.models.autoencoder.AutoencoderKL
67
+ params:
68
+ embed_dim: 4
69
+ monitor: val/rec_loss
70
+ ddconfig:
71
+ double_z: True
72
+ z_channels: 4
73
+ resolution: 256
74
+ in_channels: 3
75
+ out_ch: 3
76
+ ch: 128
77
+ ch_mult:
78
+ - 1
79
+ - 2
80
+ - 4
81
+ - 4
82
+ num_res_blocks: 2
83
+ attn_resolutions: []
84
+ dropout: 0.0
85
+ lossconfig:
86
+ target: torch.nn.Identity
87
+
88
+ cond_stage_config:
89
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
90
+ params:
91
+ freeze: true
92
+ layer: "penultimate"
93
+
94
+ img_cond_stage_config:
95
+ target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
96
+ params:
97
+ freeze: true
98
+
99
+ image_proj_stage_config:
100
+ target: lvdm.modules.encoders.resampler.Resampler
101
+ params:
102
+ dim: 1024
103
+ depth: 4
104
+ dim_head: 64
105
+ heads: 12
106
+ num_queries: 16
107
+ embedding_dim: 1280
108
+ output_dim: 1024
109
+ ff_mult: 4
110
+ video_length: 16
111
+
112
+ data:
113
+ target: utils_data.DataModuleFromConfig
114
+ params:
115
+ batch_size: 2
116
+ num_workers: 12
117
+ wrap: false
118
+ train:
119
+ target: lvdm.data.webvid.WebVid
120
+ params:
121
+ data_dir: <WebVid10M DATA>
122
+ meta_path: <.csv FILE>
123
+ video_length: 16
124
+ frame_stride: 6
125
+ load_raw_resolution: true
126
+ resolution: [320, 512]
127
+ spatial_transform: resize_center_crop
128
+ random_fs: true ## if true, we uniformly sample fs with max_fs=frame_stride (above)
129
+
130
+ lightning:
131
+ precision: 16
132
+ # strategy: deepspeed_stage_2
133
+ trainer:
134
+ benchmark: True
135
+ accumulate_grad_batches: 2
136
+ max_steps: 100000
137
+ # logger
138
+ log_every_n_steps: 50
139
+ # val
140
+ val_check_interval: 0.5
141
+ gradient_clip_algorithm: 'norm'
142
+ gradient_clip_val: 0.5
143
+ callbacks:
144
+ model_checkpoint:
145
+ target: pytorch_lightning.callbacks.ModelCheckpoint
146
+ params:
147
+ every_n_train_steps: 9000 #1000
148
+ filename: "{epoch}-{step}"
149
+ save_weights_only: True
150
+ metrics_over_trainsteps_checkpoint:
151
+ target: pytorch_lightning.callbacks.ModelCheckpoint
152
+ params:
153
+ filename: '{epoch}-{step}'
154
+ save_weights_only: True
155
+ every_n_train_steps: 10000 #20000 # 3s/step*2w=
156
+ batch_logger:
157
+ target: callbacks.ImageLogger
158
+ params:
159
+ batch_frequency: 500
160
+ to_local: False
161
+ max_images: 8
162
+ log_images_kwargs:
163
+ ddim_steps: 50
164
+ unconditional_guidance_scale: 7.5
165
+ timestep_spacing: uniform_trailing
166
+ guidance_rescale: 0.7
configs/training_512_v1.0/run.sh ADDED
@@ -0,0 +1,37 @@
1
+ # NCCL configuration
2
+ # export NCCL_DEBUG=INFO
3
+ # export NCCL_IB_DISABLE=0
4
+ # export NCCL_IB_GID_INDEX=3
5
+ # export NCCL_NET_GDR_LEVEL=3
6
+ # export NCCL_TOPO_FILE=/tmp/topo.txt
7
+
8
+ # args
9
+ name="training_512_v1.0"
10
+ config_file=configs/${name}/config.yaml
11
+
12
+ # save root dir for logs, checkpoints, tensorboard record, etc.
13
+ save_root="<YOUR_SAVE_ROOT_DIR>"
14
+
15
+ mkdir -p $save_root/$name
16
+
17
+ ## run
18
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch \
19
+ --nproc_per_node=$HOST_GPU_NUM --nnodes=1 --master_addr=127.0.0.1 --master_port=12352 --node_rank=0 \
20
+ ./main/trainer.py \
21
+ --base $config_file \
22
+ --train \
23
+ --name $name \
24
+ --logdir $save_root \
25
+ --devices $HOST_GPU_NUM \
26
+ lightning.trainer.num_nodes=1
27
+
28
+ ## debugging
29
+ # CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m torch.distributed.launch \
30
+ # --nproc_per_node=4 --nnodes=1 --master_addr=127.0.0.1 --master_port=12352 --node_rank=0 \
31
+ # ./main/trainer.py \
32
+ # --base $config_file \
33
+ # --train \
34
+ # --name $name \
35
+ # --logdir $save_root \
36
+ # --devices 4 \
37
+ # lightning.trainer.num_nodes=1
genshin_impact_img/AYATO_source.webp ADDED
genshin_impact_img/AYATO_target.webp ADDED
genshin_impact_img/ZHONGLI_source.webp ADDED
genshin_impact_img/ZHONGLI_target.webp ADDED
genshin_impact_img/ayato_smiling.mp4 ADDED
Binary file (610 kB).
 
genshin_impact_img/zhongli_sitting_down.mp4 ADDED
Binary file (621 kB).
 
gradio_app.py ADDED
@@ -0,0 +1,82 @@
1
+ import os, argparse
2
+ import sys
3
+ import gradio as gr
4
+ from scripts.gradio.i2v_test_application import Image2Video
5
+ sys.path.insert(1, os.path.join(sys.path[0], 'lvdm'))
6
+
7
+
8
+ i2v_examples_interp_512 = [
9
+ ['prompts/512_interp/74906_1462_frame1.png', 'walking man', 50, 7.5, 1.0, 10, 123, 'prompts/512_interp/74906_1462_frame3.png'],
10
+ ['prompts/512_interp/Japan_v2_2_062266_s2_frame1.png', 'an anime scene', 50, 7.5, 1.0, 10, 789, 'prompts/512_interp/Japan_v2_2_062266_s2_frame3.png'],
11
+ ['prompts/512_interp/Japan_v2_3_119235_s2_frame1.png', 'an anime scene', 50, 7.5, 1.0, 10, 123, 'prompts/512_interp/Japan_v2_3_119235_s2_frame3.png'],
12
+ ]
13
+
14
+
15
+
16
+
17
+ def dynamicrafter_demo(result_dir='./tmp/', res=512):
18
+ if res == 1024:
19
+ resolution = '576_1024'
20
+ css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height:576px}"""
21
+ elif res == 512:
22
+ resolution = '320_512'
23
+ css = """#input_img {max-width: 512px !important} #input_img2 {max-width: 512px !important} #output_vid {max-width: 512px; max-height: 320px}"""
24
+ elif res == 256:
25
+ resolution = '256_256'
26
+ css = """#input_img {max-width: 256px !important} #output_vid {max-width: 256px; max-height: 256px}"""
27
+ else:
28
+ raise NotImplementedError(f"Unsupported resolution: {res}")
29
+ image2video = Image2Video(result_dir, resolution=resolution)
30
+ with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
31
+
32
+
33
+
34
+ with gr.Tab(label='ToonCrafter_320x512'):
35
+ with gr.Column():
36
+ with gr.Row():
37
+ with gr.Column():
38
+ with gr.Row():
39
+ i2v_input_image = gr.Image(label="Input Image1", elem_id="input_img")
40
+ with gr.Row():
41
+ i2v_input_text = gr.Text(label='Prompts')
42
+ with gr.Row():
43
+ i2v_seed = gr.Slider(label='Random Seed', minimum=0, maximum=50000, step=1, value=123)
44
+ i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="i2v_eta")
45
+ i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.5, elem_id="i2v_cfg_scale")
46
+ with gr.Row():
47
+ i2v_steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
48
+ i2v_motion = gr.Slider(minimum=5, maximum=30, step=1, elem_id="i2v_motion", label="FPS", value=10)
49
+ i2v_end_btn = gr.Button("Generate")
50
+ with gr.Column():
51
+ with gr.Row():
52
+ i2v_input_image2 = gr.Image(label="Input Image2", elem_id="input_img2")
53
+ with gr.Row():
54
+ i2v_output_video = gr.Video(label="Generated Video", elem_id="output_vid", autoplay=True, show_share_button=True)
55
+
56
+ gr.Examples(examples=i2v_examples_interp_512,
57
+ inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_input_image2],
58
+ outputs=[i2v_output_video],
59
+ fn=image2video.get_image,
60
+ cache_examples=False,
61
+ )
62
+ i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_input_image2],
63
+ outputs=[i2v_output_video],
64
+ fn=image2video.get_image
65
+ )
66
+
67
+
68
+ return dynamicrafter_iface
69
+
70
+ def get_parser():
71
+ parser = argparse.ArgumentParser()
72
+ return parser
73
+
74
+ if __name__ == "__main__":
75
+ parser = get_parser()
76
+ args = parser.parse_args()
77
+
78
+ result_dir = os.path.join('./', 'results')
79
+ dynamicrafter_iface = dynamicrafter_demo(result_dir)
80
+ dynamicrafter_iface.queue(max_size=12)
81
+ dynamicrafter_iface.launch(max_threads=1, share=True)
82
+ #dynamicrafter_iface.launch(server_name='0.0.0.0', server_port=8080, max_threads=1)
lvdm/__pycache__/basics.cpython-310.pyc ADDED
Binary file (3.12 kB).
 
lvdm/__pycache__/common.cpython-310.pyc ADDED
Binary file (4.52 kB).