Commit a891a57
1 Parent(s): 7454c19

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
- .gitattributes +6 -0
- .gitignore +17 -0
- .vscode/settings.json +19 -0
- LICENSE +21 -0
- app.py +154 -0
- assets/docs/inference.gif +0 -0
- assets/docs/showcase.gif +3 -0
- assets/docs/showcase2.gif +3 -0
- assets/examples/driving/d0.mp4 +3 -0
- assets/examples/driving/d1.mp4 +0 -0
- assets/examples/driving/d2.mp4 +0 -0
- assets/examples/driving/d3.mp4 +3 -0
- assets/examples/driving/d5.mp4 +0 -0
- assets/examples/driving/d6.mp4 +3 -0
- assets/examples/driving/d7.mp4 +0 -0
- assets/examples/driving/d8.mp4 +0 -0
- assets/examples/driving/d9.mp4 +3 -0
- assets/examples/source/s0.jpg +0 -0
- assets/examples/source/s1.jpg +0 -0
- assets/examples/source/s10.jpg +0 -0
- assets/examples/source/s2.jpg +0 -0
- assets/examples/source/s3.jpg +0 -0
- assets/examples/source/s4.jpg +0 -0
- assets/examples/source/s5.jpg +0 -0
- assets/examples/source/s6.jpg +0 -0
- assets/examples/source/s7.jpg +0 -0
- assets/examples/source/s8.jpg +0 -0
- assets/examples/source/s9.jpg +0 -0
- assets/gradio_description_animation.md +7 -0
- assets/gradio_description_retargeting.md +1 -0
- assets/gradio_description_upload.md +2 -0
- assets/gradio_title.md +10 -0
- inference.py +33 -0
- pretrained_weights/.gitkeep +0 -0
- readme.md +143 -0
- requirements.txt +22 -0
- speed.py +192 -0
- src/config/__init__.py +0 -0
- src/config/argument_config.py +44 -0
- src/config/base_config.py +29 -0
- src/config/crop_config.py +18 -0
- src/config/inference_config.py +49 -0
- src/config/models.yaml +43 -0
- src/gradio_pipeline.py +140 -0
- src/live_portrait_pipeline.py +190 -0
- src/live_portrait_wrapper.py +307 -0
- src/modules/__init__.py +0 -0
- src/modules/appearance_feature_extractor.py +48 -0
- src/modules/convnextv2.py +149 -0
- src/modules/dense_motion.py +104 -0
.gitattributes
CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/docs/showcase.gif filter=lfs diff=lfs merge=lfs -text
+assets/docs/showcase2.gif filter=lfs diff=lfs merge=lfs -text
+assets/examples/driving/d0.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/examples/driving/d3.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/examples/driving/d6.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/examples/driving/d9.mp4 filter=lfs diff=lfs merge=lfs -text

.gitignore
ADDED
@@ -0,0 +1,17 @@
# Byte-compiled / optimized / DLL files
__pycache__/
**/__pycache__/
*.py[cod]
**/*.py[cod]
*$py.class

# Model weights
**/*.pth
**/*.onnx

# Ipython notebook
*.ipynb

# Temporary files or benchmark resources
animations/*
tmp/*

.vscode/settings.json
ADDED
@@ -0,0 +1,19 @@
{
    "[python]": {
        "editor.tabSize": 4
    },
    "files.eol": "\n",
    "files.insertFinalNewline": true,
    "files.trimFinalNewlines": true,
    "files.trimTrailingWhitespace": true,
    "files.exclude": {
        "**/.git": true,
        "**/.svn": true,
        "**/.hg": true,
        "**/CVS": true,
        "**/.DS_Store": true,
        "**/Thumbs.db": true,
        "**/*.crswap": true,
        "**/__pycache__": true
    }
}

LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Kuaishou Visual Generation and Interaction Center

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

app.py
ADDED
@@ -0,0 +1,154 @@
# coding: utf-8

"""
The entrance of the gradio
"""

import tyro
import gradio as gr
import os.path as osp
from src.utils.helper import load_description
from src.gradio_pipeline import GradioPipeline
from src.config.crop_config import CropConfig
from src.config.argument_config import ArgumentConfig
from src.config.inference_config import InferenceConfig


def partial_fields(target_class, kwargs):
    return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})


# set tyro theme
tyro.extras.set_accent_color("bright_cyan")
args = tyro.cli(ArgumentConfig)

# specify configs for inference
inference_cfg = partial_fields(InferenceConfig, args.__dict__)  # use attributes of args to initialize InferenceConfig
crop_cfg = partial_fields(CropConfig, args.__dict__)  # use attributes of args to initialize CropConfig
gradio_pipeline = GradioPipeline(
    inference_cfg=inference_cfg,
    crop_cfg=crop_cfg,
    args=args
)
# assets
title_md = "assets/gradio_title.md"
example_portrait_dir = "assets/examples/source"
example_video_dir = "assets/examples/driving"
data_examples = [
    [osp.join(example_portrait_dir, "s9.jpg"), osp.join(example_video_dir, "d0.mp4"), True, True, True, True],
    [osp.join(example_portrait_dir, "s6.jpg"), osp.join(example_video_dir, "d0.mp4"), True, True, True, True],
    [osp.join(example_portrait_dir, "s10.jpg"), osp.join(example_video_dir, "d5.mp4"), True, True, True, True],
    [osp.join(example_portrait_dir, "s5.jpg"), osp.join(example_video_dir, "d6.mp4"), True, True, True, True],
    [osp.join(example_portrait_dir, "s7.jpg"), osp.join(example_video_dir, "d7.mp4"), True, True, True, True],
]
#################### interface logic ####################

# Define components first
eye_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label="target eyes-open ratio")
lip_retargeting_slider = gr.Slider(minimum=0, maximum=0.8, step=0.01, label="target lip-open ratio")
retargeting_input_image = gr.Image(type="numpy")
output_image = gr.Image(type="numpy")
output_image_paste_back = gr.Image(type="numpy")
output_video = gr.Video()
output_video_concat = gr.Video()

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(load_description(title_md))
    gr.Markdown(load_description("assets/gradio_description_upload.md"))
    with gr.Row():
        with gr.Accordion(open=True, label="Source Portrait"):
            image_input = gr.Image(type="filepath")
        with gr.Accordion(open=True, label="Driving Video"):
            video_input = gr.Video()
    gr.Markdown(load_description("assets/gradio_description_animation.md"))
    with gr.Row():
        with gr.Accordion(open=True, label="Animation Options"):
            with gr.Row():
                flag_relative_input = gr.Checkbox(value=True, label="relative motion")
                flag_do_crop_input = gr.Checkbox(value=True, label="do crop")
                flag_remap_input = gr.Checkbox(value=True, label="paste-back")
    with gr.Row():
        with gr.Column():
            process_button_animation = gr.Button("🚀 Animate", variant="primary")
        with gr.Column():
            process_button_reset = gr.ClearButton([image_input, video_input, output_video, output_video_concat], value="🧹 Clear")
    with gr.Row():
        with gr.Column():
            with gr.Accordion(open=True, label="The animated video in the original image space"):
                output_video.render()
        with gr.Column():
            with gr.Accordion(open=True, label="The animated video"):
                output_video_concat.render()
    with gr.Row():
        # Examples
        gr.Markdown("## You could choose the examples below ⬇️")
    with gr.Row():
        gr.Examples(
            examples=data_examples,
            inputs=[
                image_input,
                video_input,
                flag_relative_input,
                flag_do_crop_input,
                flag_remap_input
            ],
            examples_per_page=5
        )
    gr.Markdown(load_description("assets/gradio_description_retargeting.md"))
    with gr.Row():
        eye_retargeting_slider.render()
        lip_retargeting_slider.render()
    with gr.Row():
        process_button_retargeting = gr.Button("🚗 Retargeting", variant="primary")
        process_button_reset_retargeting = gr.ClearButton(
            [
                eye_retargeting_slider,
                lip_retargeting_slider,
                retargeting_input_image,
                output_image,
                output_image_paste_back
            ],
            value="🧹 Clear"
        )
    with gr.Row():
        with gr.Column():
            with gr.Accordion(open=True, label="Retargeting Input"):
                retargeting_input_image.render()
        with gr.Column():
            with gr.Accordion(open=True, label="Retargeting Result"):
                output_image.render()
        with gr.Column():
            with gr.Accordion(open=True, label="Paste-back Result"):
                output_image_paste_back.render()
    # binding functions for buttons
    process_button_retargeting.click(
        fn=gradio_pipeline.execute_image,
        inputs=[eye_retargeting_slider, lip_retargeting_slider],
        outputs=[output_image, output_image_paste_back],
        show_progress=True
    )
    process_button_animation.click(
        fn=gradio_pipeline.execute_video,
        inputs=[
            image_input,
            video_input,
            flag_relative_input,
            flag_do_crop_input,
            flag_remap_input
        ],
        outputs=[output_video, output_video_concat],
        show_progress=True
    )
    image_input.change(
        fn=gradio_pipeline.prepare_retargeting,
        inputs=image_input,
        outputs=[eye_retargeting_slider, lip_retargeting_slider, retargeting_input_image]
    )

##########################################################

demo.launch(
    server_name=args.server_name,
    server_port=args.server_port,
    share=args.share,
)

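Note: both app.py above and inference.py further below build their configs with the same `partial_fields` helper, which splits one flat CLI namespace into per-component dataclasses by keeping only the keys each dataclass declares. A minimal, self-contained sketch of that pattern follows (editor's illustration; the `Demo*` dataclasses are hypothetical stand-ins, not part of the repo):

```python
# Editor's sketch of the partial_fields pattern used in app.py / inference.py.
# DemoInferenceConfig and DemoCropConfig are made-up stand-ins for illustration.
from dataclasses import dataclass


@dataclass
class DemoInferenceConfig:
    device_id: int = 0
    flag_do_crop: bool = True


@dataclass
class DemoCropConfig:
    dsize: int = 512
    scale: float = 2.3


def partial_fields(target_class, kwargs):
    # keep only the keys that the target dataclass declares (defaults are class attributes)
    return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})


flat_args = {"device_id": 1, "flag_do_crop": False, "dsize": 256, "scale": 2.0, "share": True}
print(partial_fields(DemoInferenceConfig, flat_args))  # DemoInferenceConfig(device_id=1, flag_do_crop=False)
print(partial_fields(DemoCropConfig, flat_args))       # DemoCropConfig(dsize=256, scale=2.0)
```
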
assets/docs/inference.gif
ADDED
assets/docs/showcase.gif
ADDED
Git LFS Details

assets/docs/showcase2.gif
ADDED
Git LFS Details

assets/examples/driving/d0.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:63f6f9962e1fdf6e6722172e7a18155204858d5d5ce3b1e0646c150360c33bed
size 2958395

assets/examples/driving/d1.mp4
ADDED
Binary file (48.8 kB).

assets/examples/driving/d2.mp4
ADDED
Binary file (47.8 kB).

assets/examples/driving/d3.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ef5c86e49b1b43dcb1449b499eb5a7f0cbae2f78aec08b5598193be1e4257099
size 1430968

assets/examples/driving/d5.mp4
ADDED
Binary file (135 kB).

assets/examples/driving/d6.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:00e3ea79bbf28cbdc4fbb67ec655d9a0fe876e880ec45af55ae481348d0c0fff
size 1967790

assets/examples/driving/d7.mp4
ADDED
Binary file (185 kB).

assets/examples/driving/d8.mp4
ADDED
Binary file (312 kB).

assets/examples/driving/d9.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9a414aa1d547be35306d692065a2157434bf40a6025ba8e30ce12e5bb322cc33
size 2257929

assets/examples/source/s0.jpg
ADDED
assets/examples/source/s1.jpg
ADDED
assets/examples/source/s10.jpg
ADDED
assets/examples/source/s2.jpg
ADDED
assets/examples/source/s3.jpg
ADDED
assets/examples/source/s4.jpg
ADDED
assets/examples/source/s5.jpg
ADDED
assets/examples/source/s6.jpg
ADDED
assets/examples/source/s7.jpg
ADDED
assets/examples/source/s8.jpg
ADDED
assets/examples/source/s9.jpg
ADDED
assets/gradio_description_animation.md
ADDED
@@ -0,0 +1,7 @@
<span style="font-size: 1.2em;">🔥 To animate the source portrait with the driving video, please follow these steps:</span>
<div style="font-size: 1.2em; margin-left: 20px;">
1. Specify the options in the <strong>Animation Options</strong> section. We recommend checking the <strong>do crop</strong> option when the facial area occupies a relatively small portion of your image.
</div>
<div style="font-size: 1.2em; margin-left: 20px;">
2. Press the <strong>🚀 Animate</strong> button. Your animated video will appear in the result block; this may take a few moments.
</div>

assets/gradio_description_retargeting.md
ADDED
@@ -0,0 +1 @@
<span style="font-size: 1.2em;">🔥 To change the target eyes-open and lip-open ratios of the source portrait, drag the sliders and then click the <strong>🚗 Retargeting</strong> button. The result will be shown in the middle block. You can run it multiple times. <strong>😊 Set both ratios to 0.8 to see what happens!</strong></span>

assets/gradio_description_upload.md
ADDED
@@ -0,0 +1,2 @@
## 🤗 This is the official gradio demo for **LivePortrait**.
<div style="font-size: 1.2em;">Please upload a source portrait (or capture one with the webcam) to the <strong>Source Portrait</strong> field and a driving video to the <strong>Driving Video</strong> field.</div>

assets/gradio_title.md
ADDED
@@ -0,0 +1,10 @@
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
  <div>
    <h1>LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control</h1>
    <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
      <a href="https://arxiv.org/pdf/2407.03168"><img src="https://img.shields.io/badge/arXiv-2407.03168-red"></a>
      <a href="https://liveportrait.github.io"><img src="https://img.shields.io/badge/Project_Page-LivePortrait-green" alt="Project Page"></a>
      <a href="https://github.com/KwaiVGI/LivePortrait"><img src="https://img.shields.io/badge/Github-Code-blue"></a>
    </div>
  </div>
</div>

inference.py
ADDED
@@ -0,0 +1,33 @@
# coding: utf-8

import tyro
from src.config.argument_config import ArgumentConfig
from src.config.inference_config import InferenceConfig
from src.config.crop_config import CropConfig
from src.live_portrait_pipeline import LivePortraitPipeline


def partial_fields(target_class, kwargs):
    return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})


def main():
    # set tyro theme
    tyro.extras.set_accent_color("bright_cyan")
    args = tyro.cli(ArgumentConfig)

    # specify configs for inference
    inference_cfg = partial_fields(InferenceConfig, args.__dict__)  # use attributes of args to initialize InferenceConfig
    crop_cfg = partial_fields(CropConfig, args.__dict__)  # use attributes of args to initialize CropConfig

    live_portrait_pipeline = LivePortraitPipeline(
        inference_cfg=inference_cfg,
        crop_cfg=crop_cfg
    )

    # run
    live_portrait_pipeline.execute(args)


if __name__ == '__main__':
    main()

pretrained_weights/.gitkeep
ADDED
File without changes
readme.md
ADDED
@@ -0,0 +1,143 @@
<h1 align="center">LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control</h1>

<div align='center'>
<a href='https://github.com/cleardusk' target='_blank'><strong>Jianzhu Guo</strong></a><sup> 1†</sup>
<a href='https://github.com/KwaiVGI' target='_blank'><strong>Dingyun Zhang</strong></a><sup> 1,2</sup>
<a href='https://github.com/KwaiVGI' target='_blank'><strong>Xiaoqiang Liu</strong></a><sup> 1</sup>
<a href='https://github.com/KwaiVGI' target='_blank'><strong>Zhizhou Zhong</strong></a><sup> 1,3</sup>
<a href='https://scholar.google.com.hk/citations?user=_8k1ubAAAAAJ' target='_blank'><strong>Yuan Zhang</strong></a><sup> 1</sup>
</div>

<div align='center'>
<a href='https://scholar.google.com/citations?user=P6MraaYAAAAJ' target='_blank'><strong>Pengfei Wan</strong></a><sup> 1</sup>
<a href='https://openreview.net/profile?id=~Di_ZHANG3' target='_blank'><strong>Di Zhang</strong></a><sup> 1</sup>
</div>

<div align='center'>
<sup>1 </sup>Kuaishou Technology  <sup>2 </sup>University of Science and Technology of China  <sup>3 </sup>Fudan University
</div>

<br>
<div align="center">
<!-- <a href='LICENSE'><img src='https://img.shields.io/badge/license-MIT-yellow'></a> -->
<a href='https://liveportrait.github.io'><img src='https://img.shields.io/badge/Project-Homepage-green'></a>
<a href='https://arxiv.org/pdf/2407.03168'><img src='https://img.shields.io/badge/Paper-arXiv-red'></a>
</div>
<br>

<p align="center">
<img src="./assets/docs/showcase2.gif" alt="showcase">
<br>
🔥 For more results, visit our <a href="https://liveportrait.github.io/"><strong>homepage</strong></a> 🔥
</p>


## 🔥 Updates
- **`2024/07/04`**: 🔥 We released the initial version of the inference code and models. Continuous updates are coming, stay tuned!
- **`2024/07/04`**: 😊 We released the [homepage](https://liveportrait.github.io) and technical report on [arXiv](https://arxiv.org/pdf/2407.03168).

## Introduction
This repo, named **LivePortrait**, contains the official PyTorch implementation of our paper [LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control](https://arxiv.org/pdf/2407.03168).
We are actively updating and improving this repository. If you find any bugs or have suggestions, feel free to raise issues or submit pull requests (PRs) 💖.

## 🔥 Getting Started
### 1. Clone the code and prepare the environment
```bash
git clone https://github.com/KwaiVGI/LivePortrait
cd LivePortrait

# create env using conda
conda create -n LivePortrait python==3.9.18
conda activate LivePortrait
# install dependencies with pip
pip install -r requirements.txt
```

### 2. Download pretrained weights
Download our pretrained LivePortrait weights and the InsightFace face detection models from [Google Drive](https://drive.google.com/drive/folders/1UtKgzKjFAOmZkhNK-OYT0caJ_w2XAnib) or [Baidu Yun](https://pan.baidu.com/s/1MGctWmNla_vZxDbEp2Dtzw?pwd=z5cn). We have packed all weights in one directory 😊. Unzip and place them in `./pretrained_weights`, ensuring the directory structure is as follows:
```text
pretrained_weights
├── insightface
│   └── models
│       └── buffalo_l
│           ├── 2d106det.onnx
│           └── det_10g.onnx
└── liveportrait
    ├── base_models
    │   ├── appearance_feature_extractor.pth
    │   ├── motion_extractor.pth
    │   ├── spade_generator.pth
    │   └── warping_module.pth
    ├── landmark.onnx
    └── retargeting_models
        └── stitching_retargeting_module.pth
```

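(Editor's note: a quick way to confirm this layout before running inference is a small path check; the sketch below is not part of the repo and only assumes the tree shown above.)

```python
# Optional sanity check (editor's sketch): verify the expected weight files are in place.
from pathlib import Path

expected = [
    "insightface/models/buffalo_l/2d106det.onnx",
    "insightface/models/buffalo_l/det_10g.onnx",
    "liveportrait/base_models/appearance_feature_extractor.pth",
    "liveportrait/base_models/motion_extractor.pth",
    "liveportrait/base_models/spade_generator.pth",
    "liveportrait/base_models/warping_module.pth",
    "liveportrait/landmark.onnx",
    "liveportrait/retargeting_models/stitching_retargeting_module.pth",
]
root = Path("pretrained_weights")
missing = [p for p in expected if not (root / p).exists()]
print("All weights found." if not missing else f"Missing: {missing}")
```
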
### 3. Inference 🚀

```bash
python inference.py
```

If the script runs successfully, you will get an output mp4 file named `animations/s6--d0_concat.mp4`. This file includes the following results: driving video, input image, and generated result.

<p align="center">
<img src="./assets/docs/inference.gif" alt="image">
</p>

Or, you can change the input by specifying the `-s` and `-d` arguments:

```bash
python inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d0.mp4

# or disable pasting back
python inference.py -s assets/examples/source/s9.jpg -d assets/examples/driving/d0.mp4 --no_flag_pasteback

# to see more options
python inference.py -h
```

**More interesting results can be found on our [homepage](https://liveportrait.github.io)** 😊

### 4. Gradio interface

We also provide a Gradio interface for a better experience; just run:

```bash
python app.py
```

### 5. Inference speed evaluation 🚀🚀🚀
We have also provided a script to evaluate the inference speed of each module:

```bash
python speed.py
```

Below are the results of inferring one frame on an RTX 4090 GPU using the native PyTorch framework with `torch.compile`:

| Model | Parameters(M) | Model Size(MB) | Inference(ms) |
|-----------------------------------|:-------------:|:--------------:|:-------------:|
| Appearance Feature Extractor | 0.84 | 3.3 | 0.82 |
| Motion Extractor | 28.12 | 108 | 0.84 |
| Spade Generator | 55.37 | 212 | 7.59 |
| Warping Module | 45.53 | 174 | 5.21 |
| Stitching and Retargeting Modules | 0.23 | 2.3 | 0.31 |

*Note: the listed values for the Stitching and Retargeting Modules represent the combined parameter counts and the total sequential inference time of three MLP networks.*


## Acknowledgements
We would like to thank the contributors of the [FOMM](https://github.com/AliaksandrSiarohin/first-order-model), [Open Facevid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis), [SPADE](https://github.com/NVlabs/SPADE), and [InsightFace](https://github.com/deepinsight/insightface) repositories for their open research and contributions.

## Citation 💖
If you find LivePortrait useful for your research, please 🌟 this repo and cite our work using the following BibTeX:
```bibtex
@article{guo2024live,
  title   = {LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control},
  author  = {Jianzhu Guo and Dingyun Zhang and Xiaoqiang Liu and Zhizhou Zhong and Yuan Zhang and Pengfei Wan and Di Zhang},
  year    = {2024},
  journal = {arXiv preprint:2407.03168},
}
```

requirements.txt
ADDED
@@ -0,0 +1,22 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.3.0
torchvision==0.18.0
torchaudio==2.3.0

numpy==1.26.4
pyyaml==6.0.1
opencv-python==4.10.0.84
scipy==1.13.1
imageio==2.34.2
lmdb==1.4.1
tqdm==4.66.4
rich==13.7.1
ffmpeg==1.4
onnxruntime-gpu==1.18.0
onnx==1.16.1
scikit-image==0.24.0
albumentations==1.4.10
matplotlib==3.9.0
imageio-ffmpeg==0.5.1
tyro==0.8.5
gradio==4.37.1

speed.py
ADDED
@@ -0,0 +1,192 @@
# coding: utf-8

"""
Benchmark the inference speed of each module in LivePortrait.

TODO: heavy GPT style, need to refactor
"""

import yaml
import torch
import time
import numpy as np
from src.utils.helper import load_model, concat_feat
from src.config.inference_config import InferenceConfig


def initialize_inputs(batch_size=1):
    """
    Generate random input tensors and move them to GPU
    """
    feature_3d = torch.randn(batch_size, 32, 16, 64, 64).cuda().half()
    kp_source = torch.randn(batch_size, 21, 3).cuda().half()
    kp_driving = torch.randn(batch_size, 21, 3).cuda().half()
    source_image = torch.randn(batch_size, 3, 256, 256).cuda().half()
    generator_input = torch.randn(batch_size, 256, 64, 64).cuda().half()
    eye_close_ratio = torch.randn(batch_size, 3).cuda().half()
    lip_close_ratio = torch.randn(batch_size, 2).cuda().half()
    feat_stitching = concat_feat(kp_source, kp_driving).half()
    feat_eye = concat_feat(kp_source, eye_close_ratio).half()
    feat_lip = concat_feat(kp_source, lip_close_ratio).half()

    inputs = {
        'feature_3d': feature_3d,
        'kp_source': kp_source,
        'kp_driving': kp_driving,
        'source_image': source_image,
        'generator_input': generator_input,
        'feat_stitching': feat_stitching,
        'feat_eye': feat_eye,
        'feat_lip': feat_lip
    }

    return inputs


def load_and_compile_models(cfg, model_config):
    """
    Load and compile models for inference
    """
    appearance_feature_extractor = load_model(cfg.checkpoint_F, model_config, cfg.device_id, 'appearance_feature_extractor')
    motion_extractor = load_model(cfg.checkpoint_M, model_config, cfg.device_id, 'motion_extractor')
    warping_module = load_model(cfg.checkpoint_W, model_config, cfg.device_id, 'warping_module')
    spade_generator = load_model(cfg.checkpoint_G, model_config, cfg.device_id, 'spade_generator')
    stitching_retargeting_module = load_model(cfg.checkpoint_S, model_config, cfg.device_id, 'stitching_retargeting_module')

    models_with_params = [
        ('Appearance Feature Extractor', appearance_feature_extractor),
        ('Motion Extractor', motion_extractor),
        ('Warping Network', warping_module),
        ('SPADE Decoder', spade_generator)
    ]

    compiled_models = {}
    for name, model in models_with_params:
        model = model.half()
        model = torch.compile(model, mode='max-autotune')  # Optimize for inference
        model.eval()  # Switch to evaluation mode
        compiled_models[name] = model

    retargeting_models = ['stitching', 'eye', 'lip']
    for retarget in retargeting_models:
        module = stitching_retargeting_module[retarget].half()
        module = torch.compile(module, mode='max-autotune')  # Optimize for inference
        module.eval()  # Switch to evaluation mode
        stitching_retargeting_module[retarget] = module

    return compiled_models, stitching_retargeting_module


def warm_up_models(compiled_models, stitching_retargeting_module, inputs):
    """
    Warm up models to prepare them for benchmarking
    """
    print("Warm up start!")
    with torch.no_grad():
        for _ in range(10):
            compiled_models['Appearance Feature Extractor'](inputs['source_image'])
            compiled_models['Motion Extractor'](inputs['source_image'])
            compiled_models['Warping Network'](inputs['feature_3d'], inputs['kp_driving'], inputs['kp_source'])
            compiled_models['SPADE Decoder'](inputs['generator_input'])  # Adjust input as required
            stitching_retargeting_module['stitching'](inputs['feat_stitching'])
            stitching_retargeting_module['eye'](inputs['feat_eye'])
            stitching_retargeting_module['lip'](inputs['feat_lip'])
    print("Warm up end!")


def measure_inference_times(compiled_models, stitching_retargeting_module, inputs):
    """
    Measure inference times for each model
    """
    times = {name: [] for name in compiled_models.keys()}
    times['Retargeting Models'] = []

    overall_times = []

    with torch.no_grad():
        for _ in range(100):
            torch.cuda.synchronize()
            overall_start = time.time()

            start = time.time()
            compiled_models['Appearance Feature Extractor'](inputs['source_image'])
            torch.cuda.synchronize()
            times['Appearance Feature Extractor'].append(time.time() - start)

            start = time.time()
            compiled_models['Motion Extractor'](inputs['source_image'])
            torch.cuda.synchronize()
            times['Motion Extractor'].append(time.time() - start)

            start = time.time()
            compiled_models['Warping Network'](inputs['feature_3d'], inputs['kp_driving'], inputs['kp_source'])
            torch.cuda.synchronize()
            times['Warping Network'].append(time.time() - start)

            start = time.time()
            compiled_models['SPADE Decoder'](inputs['generator_input'])  # Adjust input as required
            torch.cuda.synchronize()
            times['SPADE Decoder'].append(time.time() - start)

            start = time.time()
            stitching_retargeting_module['stitching'](inputs['feat_stitching'])
            stitching_retargeting_module['eye'](inputs['feat_eye'])
            stitching_retargeting_module['lip'](inputs['feat_lip'])
            torch.cuda.synchronize()
            times['Retargeting Models'].append(time.time() - start)

            overall_times.append(time.time() - overall_start)

    return times, overall_times


def print_benchmark_results(compiled_models, stitching_retargeting_module, retargeting_models, times, overall_times):
    """
    Print benchmark results with average and standard deviation of inference times
    """
    average_times = {name: np.mean(times[name]) * 1000 for name in times.keys()}
    std_times = {name: np.std(times[name]) * 1000 for name in times.keys()}

    for name, model in compiled_models.items():
        num_params = sum(p.numel() for p in model.parameters())
        num_params_in_millions = num_params / 1e6
        print(f"Number of parameters for {name}: {num_params_in_millions:.2f} M")

    for index, retarget in enumerate(retargeting_models):
        num_params = sum(p.numel() for p in stitching_retargeting_module[retarget].parameters())
        num_params_in_millions = num_params / 1e6
        print(f"Number of parameters for part_{index} in Stitching and Retargeting Modules: {num_params_in_millions:.2f} M")

    for name, avg_time in average_times.items():
        std_time = std_times[name]
        print(f"Average inference time for {name} over 100 runs: {avg_time:.2f} ms (std: {std_time:.2f} ms)")


def main():
    """
    Main function to benchmark speed and model parameters
    """
    # Sample input tensors
    inputs = initialize_inputs()

    # Load configuration
    cfg = InferenceConfig(device_id=0)
    model_config_path = cfg.models_config
    with open(model_config_path, 'r') as file:
        model_config = yaml.safe_load(file)

    # Load and compile models
    compiled_models, stitching_retargeting_module = load_and_compile_models(cfg, model_config)

    # Warm up models
    warm_up_models(compiled_models, stitching_retargeting_module, inputs)

    # Measure inference times
    times, overall_times = measure_inference_times(compiled_models, stitching_retargeting_module, inputs)

    # Print benchmark results
    print_benchmark_results(compiled_models, stitching_retargeting_module, ['stitching', 'eye', 'lip'], times, overall_times)


if __name__ == "__main__":
    main()

src/config/__init__.py
ADDED
File without changes
src/config/argument_config.py
ADDED
@@ -0,0 +1,44 @@
# coding: utf-8

"""
config for user
"""

import os.path as osp
from dataclasses import dataclass
import tyro
from typing_extensions import Annotated
from .base_config import PrintableConfig, make_abs_path


@dataclass(repr=False)  # use repr from PrintableConfig
class ArgumentConfig(PrintableConfig):
    ########## input arguments ##########
    source_image: Annotated[str, tyro.conf.arg(aliases=["-s"])] = make_abs_path('../../assets/examples/source/s6.jpg')  # path to the source portrait
    driving_info: Annotated[str, tyro.conf.arg(aliases=["-d"])] = make_abs_path('../../assets/examples/driving/d0.mp4')  # path to driving video or template (.pkl format)
    output_dir: Annotated[str, tyro.conf.arg(aliases=["-o"])] = 'animations/'  # directory to save output video
    #####################################

    ########## inference arguments ##########
    device_id: int = 0
    flag_lip_zero: bool = True  # whether to set the lip to the closed state before animation; only takes effect when flag_eye_retargeting and flag_lip_retargeting are False
    flag_eye_retargeting: bool = False
    flag_lip_retargeting: bool = False
    flag_stitching: bool = True  # we recommend setting it to True!
    flag_relative: bool = True  # whether to use relative motion
    flag_pasteback: bool = True  # whether to paste back / stitch the animated face crop from the face-crop space into the original image space
    flag_do_crop: bool = True  # whether to crop the source portrait to the face-crop space
    flag_do_rot: bool = True  # whether to apply the rotation when flag_do_crop is True
    #########################################

    ########## crop arguments ##########
    dsize: int = 512
    scale: float = 2.3
    vx_ratio: float = 0  # vx ratio
    vy_ratio: float = -0.125  # vy ratio +up, -down
    ####################################

    ########## gradio arguments ##########
    server_port: Annotated[int, tyro.conf.arg(aliases=["-p"])] = 8890
    share: bool = True
    server_name: str = "0.0.0.0"

src/config/base_config.py
ADDED
@@ -0,0 +1,29 @@
# coding: utf-8

"""
pretty printing class
"""

from __future__ import annotations
import os.path as osp
from typing import Tuple


def make_abs_path(fn):
    return osp.join(osp.dirname(osp.realpath(__file__)), fn)


class PrintableConfig:  # pylint: disable=too-few-public-methods
    """Printable Config defining str function"""

    def __repr__(self):
        lines = [self.__class__.__name__ + ":"]
        for key, val in vars(self).items():
            if isinstance(val, Tuple):
                flattened_val = "["
                for item in val:
                    flattened_val += str(item) + "\n"
                flattened_val = flattened_val.rstrip("\n")
                val = flattened_val + "]"
            lines += f"{key}: {str(val)}".split("\n")
        return "\n ".join(lines)

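A minimal usage sketch of the repr defined above (editor's addition; it assumes the repo root is the working directory so that `src` is importable, and `TinyConfig` is a made-up example):

```python
# Editor's sketch: every config dataclass inherits this readable multi-line repr.
from dataclasses import dataclass
from src.config.base_config import PrintableConfig


@dataclass(repr=False)  # keep the __repr__ inherited from PrintableConfig
class TinyConfig(PrintableConfig):
    dsize: int = 512
    scale: float = 2.3


print(TinyConfig())  # prints the class name followed by one "key: value" line per field
```
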
src/config/crop_config.py
ADDED
@@ -0,0 +1,18 @@
# coding: utf-8

"""
parameters used for face cropping
"""

import os.path as osp
from dataclasses import dataclass
from typing import Union, List
from .base_config import PrintableConfig


@dataclass(repr=False)  # use repr from PrintableConfig
class CropConfig(PrintableConfig):
    dsize: int = 512  # crop size
    scale: float = 2.3  # scale factor
    vx_ratio: float = 0  # vx ratio
    vy_ratio: float = -0.125  # vy ratio +up, -down

src/config/inference_config.py
ADDED
@@ -0,0 +1,49 @@
# coding: utf-8

"""
config dataclass used for inference
"""

import os.path as osp
from dataclasses import dataclass
from typing import Literal, Tuple
from .base_config import PrintableConfig, make_abs_path


@dataclass(repr=False)  # use repr from PrintableConfig
class InferenceConfig(PrintableConfig):
    models_config: str = make_abs_path('./models.yaml')  # portrait animation config
    checkpoint_F: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/appearance_feature_extractor.pth')  # path to checkpoint
    checkpoint_M: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/motion_extractor.pth')  # path to checkpoint
    checkpoint_G: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/spade_generator.pth')  # path to checkpoint
    checkpoint_W: str = make_abs_path('../../pretrained_weights/liveportrait/base_models/warping_module.pth')  # path to checkpoint

    checkpoint_S: str = make_abs_path('../../pretrained_weights/liveportrait/retargeting_models/stitching_retargeting_module.pth')  # path to checkpoint
    flag_use_half_precision: bool = True  # whether to use half precision

    flag_lip_zero: bool = True  # whether to set the lip to the closed state before animation; only takes effect when flag_eye_retargeting and flag_lip_retargeting are False
    lip_zero_threshold: float = 0.03

    flag_eye_retargeting: bool = False
    flag_lip_retargeting: bool = False
    flag_stitching: bool = True  # we recommend setting it to True!

    flag_relative: bool = True  # whether to use relative motion
    anchor_frame: int = 0  # set this value if find_best_frame is True

    input_shape: Tuple[int, int] = (256, 256)  # input shape
    output_format: Literal['mp4', 'gif'] = 'mp4'  # output video format
    output_fps: int = 30  # fps for output video
    crf: int = 15  # crf for output video

    flag_write_result: bool = True  # whether to write output video
    flag_pasteback: bool = True  # whether to paste back / stitch the animated face crop from the face-crop space into the original image space
    mask_crop = None
    flag_write_gif: bool = False
    size_gif: int = 256
    ref_max_shape: int = 1280
    ref_shape_n: int = 2

    device_id: int = 0
    flag_do_crop: bool = False  # whether to crop the source portrait to the face-crop space
    flag_do_rot: bool = True  # whether to apply the rotation when flag_do_crop is True

src/config/models.yaml
ADDED
@@ -0,0 +1,43 @@
model_params:
  appearance_feature_extractor_params: # the F in the paper
    image_channel: 3
    block_expansion: 64
    num_down_blocks: 2
    max_features: 512
    reshape_channel: 32
    reshape_depth: 16
    num_resblocks: 6
  motion_extractor_params: # the M in the paper
    num_kp: 21
    backbone: convnextv2_tiny
  warping_module_params: # the W in the paper
    num_kp: 21
    block_expansion: 64
    max_features: 512
    num_down_blocks: 2
    reshape_channel: 32
    estimate_occlusion_map: True
    dense_motion_params:
      block_expansion: 32
      max_features: 1024
      num_blocks: 5
      reshape_depth: 16
      compress: 4
  spade_generator_params: # the G in the paper
    upscale: 2 # represents upsample factor 256x256 -> 512x512
    block_expansion: 64
    max_features: 512
    num_down_blocks: 2
  stitching_retargeting_module_params: # the S in the paper
    stitching:
      input_size: 126 # (21*3)*2
      hidden_sizes: [128, 128, 64]
      output_size: 65 # (21*3)+2(tx,ty)
    lip:
      input_size: 65 # (21*3)+2
      hidden_sizes: [128, 128, 64]
      output_size: 63 # (21*3)
    eye:
      input_size: 66 # (21*3)+3
      hidden_sizes: [256, 256, 128, 128, 64]
      output_size: 63 # (21*3)

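The commented sizes under `stitching_retargeting_module_params` all follow from the 21 implicit keypoints; a quick check of that arithmetic (editor's sketch, with the input layouts taken from how `speed.py` builds `feat_stitching`, `feat_eye`, and `feat_lip`):

```python
# Editor's sketch: reproduce the commented I/O sizes in models.yaml from num_kp = 21.
num_kp = 21
kp_dim = num_kp * 3        # 63: a flattened set of 21 (x, y, z) keypoints

assert kp_dim * 2 == 126   # stitching input: source keypoints concatenated with driving keypoints
assert kp_dim + 2 == 65    # stitching output: keypoint delta plus (tx, ty); also the lip input (keypoints + 2 lip-ratio values)
assert kp_dim + 3 == 66    # eye input: keypoints + 3 eye-ratio values
assert kp_dim == 63        # lip / eye output: a keypoint delta
print("all sizes consistent")
```
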
src/gradio_pipeline.py
ADDED
@@ -0,0 +1,140 @@
# coding: utf-8

"""
Pipeline for gradio
"""
import gradio as gr
from .config.argument_config import ArgumentConfig
from .live_portrait_pipeline import LivePortraitPipeline
from .utils.io import load_img_online
from .utils.rprint import rlog as log
from .utils.crop import prepare_paste_back, paste_back
from .utils.camera import get_rotation_matrix
from .utils.retargeting_utils import calc_eye_close_ratio, calc_lip_close_ratio


def update_args(args, user_args):
    """update the args according to user inputs
    """
    for k, v in user_args.items():
        if hasattr(args, k):
            setattr(args, k, v)
    return args


class GradioPipeline(LivePortraitPipeline):

    def __init__(self, inference_cfg, crop_cfg, args: ArgumentConfig):
        super().__init__(inference_cfg, crop_cfg)
        # self.live_portrait_wrapper = self.live_portrait_wrapper
        self.args = args
        # for single image retargeting
        self.start_prepare = False
        self.f_s_user = None
        self.x_s_info_user = None
        self.x_s_user = None
        self.source_lmk_user = None
        self.mask_ori = None
        self.img_rgb = None
        self.crop_M_c2o = None

    def execute_video(
        self,
        input_image_path,
        input_video_path,
        flag_relative_input,
        flag_do_crop_input,
        flag_remap_input,
    ):
        """ for video-driven portrait animation
        """
        if input_image_path is not None and input_video_path is not None:
            args_user = {
                'source_image': input_image_path,
                'driving_info': input_video_path,
                'flag_relative': flag_relative_input,
                'flag_do_crop': flag_do_crop_input,
                'flag_pasteback': flag_remap_input,
            }
            # update config from user input
            self.args = update_args(self.args, args_user)
            self.live_portrait_wrapper.update_config(self.args.__dict__)
            self.cropper.update_config(self.args.__dict__)
            # video driven animation
            video_path, video_path_concat = self.execute(self.args)
            gr.Info("Run successfully!", duration=2)
            return video_path, video_path_concat,
        else:
            raise gr.Error("The input source portrait or driving video hasn't been prepared yet 💥!", duration=5)

    def execute_image(self, input_eye_ratio: float, input_lip_ratio: float):
        """ for single image retargeting
        """
        if input_eye_ratio is None or input_lip_ratio is None:
            raise gr.Error("Invalid ratio input 💥!", duration=5)
        elif self.f_s_user is None:
            if self.start_prepare:
                raise gr.Error(
                    "The source portrait is under processing 💥! Please wait for a second.",
                    duration=5
                )
            else:
                raise gr.Error(
                    "The source portrait hasn't been prepared yet 💥! Please scroll to the top of the page to upload.",
                    duration=5
                )
        else:
            # ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
            combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio([[input_eye_ratio]], self.source_lmk_user)
            eyes_delta = self.live_portrait_wrapper.retarget_eye(self.x_s_user, combined_eye_ratio_tensor)
            # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
            combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio([[input_lip_ratio]], self.source_lmk_user)
            lip_delta = self.live_portrait_wrapper.retarget_lip(self.x_s_user, combined_lip_ratio_tensor)
            num_kp = self.x_s_user.shape[1]
            # default: use x_s
            x_d_new = self.x_s_user + eyes_delta.reshape(-1, num_kp, 3) + lip_delta.reshape(-1, num_kp, 3)
            # D(W(f_s; x_s, x′_d))
            out = self.live_portrait_wrapper.warp_decode(self.f_s_user, self.x_s_user, x_d_new)
            out = self.live_portrait_wrapper.parse_output(out['out'])[0]
            out_to_ori_blend = paste_back(out, self.crop_M_c2o, self.img_rgb, self.mask_ori)
            gr.Info("Run successfully!", duration=2)
            return out, out_to_ori_blend

    def prepare_retargeting(self, input_image_path, flag_do_crop=True):
        """ for single image retargeting
        """
        if input_image_path is not None:
            gr.Info("Upload successfully!", duration=2)
            self.start_prepare = True
            inference_cfg = self.live_portrait_wrapper.cfg
            ######## process source portrait ########
            img_rgb = load_img_online(input_image_path, mode='rgb', max_dim=1280, n=16)
            log(f"Load source image from {input_image_path}.")
            crop_info = self.cropper.crop_single_image(img_rgb)
            if flag_do_crop:
                I_s = self.live_portrait_wrapper.prepare_source(crop_info['img_crop_256x256'])
            else:
                I_s = self.live_portrait_wrapper.prepare_source(img_rgb)
            x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)
            R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
            ############################################

            # record global info for later use
            self.f_s_user = self.live_portrait_wrapper.extract_feature_3d(I_s)
            self.x_s_user = self.live_portrait_wrapper.transform_keypoint(x_s_info)
            self.x_s_info_user = x_s_info
            self.source_lmk_user = crop_info['lmk_crop']
            self.img_rgb = img_rgb
            self.crop_M_c2o = crop_info['M_c2o']
            self.mask_ori = prepare_paste_back(inference_cfg.mask_crop, crop_info['M_c2o'], dsize=(img_rgb.shape[1], img_rgb.shape[0]))
            # update sliders
            eye_close_ratio = calc_eye_close_ratio(self.source_lmk_user[None])
            eye_close_ratio = float(eye_close_ratio.squeeze(0).mean())
            lip_close_ratio = calc_lip_close_ratio(self.source_lmk_user[None])
            lip_close_ratio = float(lip_close_ratio.squeeze(0).mean())
            # for vis
            self.I_s_vis = self.live_portrait_wrapper.parse_output(I_s)[0]
            return eye_close_ratio, lip_close_ratio, self.I_s_vis
        else:
            # when the clear button is pressed, we end up here
            return 0.8, 0.8, self.I_s_vis

src/live_portrait_pipeline.py
ADDED
@@ -0,0 +1,190 @@
# coding: utf-8

"""
Pipeline of LivePortrait
"""

# TODO:
# 1. Currently we assume that all templates are already cropped; this needs to be revised
# 2. Pick example source + driving pairs

import cv2
import numpy as np
import pickle
import os.path as osp
from rich.progress import track

from .config.argument_config import ArgumentConfig
from .config.inference_config import InferenceConfig
from .config.crop_config import CropConfig
from .utils.cropper import Cropper
from .utils.camera import get_rotation_matrix
from .utils.video import images2video, concat_frames
from .utils.crop import _transform_img, prepare_paste_back, paste_back
from .utils.retargeting_utils import calc_lip_close_ratio
from .utils.io import load_image_rgb, load_driving_info, resize_to_limit
from .utils.helper import mkdir, basename, dct2cuda, is_video, is_template
from .utils.rprint import rlog as log
from .live_portrait_wrapper import LivePortraitWrapper


def make_abs_path(fn):
    return osp.join(osp.dirname(osp.realpath(__file__)), fn)


class LivePortraitPipeline(object):

    def __init__(self, inference_cfg: InferenceConfig, crop_cfg: CropConfig):
        self.live_portrait_wrapper: LivePortraitWrapper = LivePortraitWrapper(cfg=inference_cfg)
        self.cropper = Cropper(crop_cfg=crop_cfg)

    def execute(self, args: ArgumentConfig):
        inference_cfg = self.live_portrait_wrapper.cfg  # for convenience
        ######## process source portrait ########
        img_rgb = load_image_rgb(args.source_image)
        img_rgb = resize_to_limit(img_rgb, inference_cfg.ref_max_shape, inference_cfg.ref_shape_n)
        log(f"Load source image from {args.source_image}")
        crop_info = self.cropper.crop_single_image(img_rgb)
        source_lmk = crop_info['lmk_crop']
        img_crop, img_crop_256x256 = crop_info['img_crop'], crop_info['img_crop_256x256']
        if inference_cfg.flag_do_crop:
            I_s = self.live_portrait_wrapper.prepare_source(img_crop_256x256)
        else:
            I_s = self.live_portrait_wrapper.prepare_source(img_rgb)
        x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)
        x_c_s = x_s_info['kp']
        R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
        f_s = self.live_portrait_wrapper.extract_feature_3d(I_s)
        x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)

        if inference_cfg.flag_lip_zero:
            # let the lip-open scalar be 0 at first
            c_d_lip_before_animation = [0.]
            combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
            if combined_lip_ratio_tensor_before_animation[0][0] < inference_cfg.lip_zero_threshold:
                inference_cfg.flag_lip_zero = False
            else:
                lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)
        ############################################

        ######## process driving info ########
        if is_video(args.driving_info):
            log(f"Load from video file (mp4 mov avi etc...): {args.driving_info}")
            # TODO: track the driving video here -> build the template
            driving_rgb_lst = load_driving_info(args.driving_info)
            driving_rgb_lst_256 = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]
            I_d_lst = self.live_portrait_wrapper.prepare_driving_videos(driving_rgb_lst_256)
            n_frames = I_d_lst.shape[0]
            if inference_cfg.flag_eye_retargeting or inference_cfg.flag_lip_retargeting:
                driving_lmk_lst = self.cropper.get_retargeting_lmk_info(driving_rgb_lst)
                input_eye_ratio_lst, input_lip_ratio_lst = self.live_portrait_wrapper.calc_retargeting_ratio(source_lmk, driving_lmk_lst)
        elif is_template(args.driving_info):
            log(f"Load from video templates {args.driving_info}")
            with open(args.driving_info, 'rb') as f:
                template_lst, driving_lmk_lst = pickle.load(f)
            n_frames = template_lst[0]['n_frames']
            input_eye_ratio_lst, input_lip_ratio_lst = self.live_portrait_wrapper.calc_retargeting_ratio(source_lmk, driving_lmk_lst)
        else:
            raise Exception("Unsupported driving types!")
        #########################################

        ######## prepare for pasteback ########
        if inference_cfg.flag_pasteback:
            mask_ori = prepare_paste_back(inference_cfg.mask_crop, crop_info['M_c2o'], dsize=(img_rgb.shape[1], img_rgb.shape[0]))
            I_p_paste_lst = []
        #########################################

        I_p_lst = []
        R_d_0, x_d_0_info = None, None
        for i in track(range(n_frames), description='Animating...', total=n_frames):
            if is_video(args.driving_info):
                # extract kp info by M
                I_d_i = I_d_lst[i]
                x_d_i_info = self.live_portrait_wrapper.get_kp_info(I_d_i)
                R_d_i = get_rotation_matrix(x_d_i_info['pitch'], x_d_i_info['yaw'], x_d_i_info['roll'])
            else:
                # from template
                x_d_i_info = template_lst[i]
                x_d_i_info = dct2cuda(x_d_i_info, inference_cfg.device_id)
                R_d_i = x_d_i_info['R_d']

            if i == 0:
                R_d_0 = R_d_i
                x_d_0_info = x_d_i_info

            if inference_cfg.flag_relative:
                R_new = (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
                delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
                scale_new = x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
                t_new = x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
            else:
                R_new = R_d_i
                delta_new = x_d_i_info['exp']
                scale_new = x_s_info['scale']
                t_new = x_d_i_info['t']

            t_new[..., 2].fill_(0)  # zero tz
            x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new

            # Algorithm 1:
            if not inference_cfg.flag_stitching and not inference_cfg.flag_eye_retargeting and not inference_cfg.flag_lip_retargeting:
                # without stitching or retargeting
                if inference_cfg.flag_lip_zero:
                    x_d_i_new += lip_delta_before_animation.reshape(-1, x_s.shape[1], 3)
                else:
                    pass
            elif inference_cfg.flag_stitching and not inference_cfg.flag_eye_retargeting and not inference_cfg.flag_lip_retargeting:
                # with stitching and without retargeting
                if inference_cfg.flag_lip_zero:
                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new) + lip_delta_before_animation.reshape(-1, x_s.shape[1], 3)
                else:
                    x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)
            else:
                eyes_delta, lip_delta = None, None
                if inference_cfg.flag_eye_retargeting:
                    c_d_eyes_i = input_eye_ratio_lst[i]
                    combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio(c_d_eyes_i, source_lmk)
                    # ∆_eyes,i = R_eyes(x_s; c_s,eyes, c_d,eyes,i)
                    eyes_delta = self.live_portrait_wrapper.retarget_eye(x_s, combined_eye_ratio_tensor)
                if inference_cfg.flag_lip_retargeting:
                    c_d_lip_i = input_lip_ratio_lst[i]
                    combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_i, source_lmk)
                    # ∆_lip,i = R_lip(x_s; c_s,lip, c_d,lip,i)
                    lip_delta = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor)
|
154 |
+
|
155 |
+
if inference_cfg.flag_relative: # use x_s
|
156 |
+
x_d_i_new = x_s + \
|
157 |
+
(eyes_delta.reshape(-1, x_s.shape[1], 3) if eyes_delta is not None else 0) + \
|
158 |
+
(lip_delta.reshape(-1, x_s.shape[1], 3) if lip_delta is not None else 0)
|
159 |
+
else: # use x_d,i
|
160 |
+
x_d_i_new = x_d_i_new + \
|
161 |
+
(eyes_delta.reshape(-1, x_s.shape[1], 3) if eyes_delta is not None else 0) + \
|
162 |
+
(lip_delta.reshape(-1, x_s.shape[1], 3) if lip_delta is not None else 0)
|
163 |
+
|
164 |
+
if inference_cfg.flag_stitching:
|
165 |
+
x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)
|
166 |
+
|
167 |
+
out = self.live_portrait_wrapper.warp_decode(f_s, x_s, x_d_i_new)
|
168 |
+
I_p_i = self.live_portrait_wrapper.parse_output(out['out'])[0]
|
169 |
+
I_p_lst.append(I_p_i)
|
170 |
+
|
171 |
+
if inference_cfg.flag_pasteback:
|
172 |
+
I_p_i_to_ori_blend = paste_back(I_p_i, crop_info['M_c2o'], img_rgb, mask_ori)
|
173 |
+
I_p_paste_lst.append(I_p_i_to_ori_blend)
|
174 |
+
|
175 |
+
mkdir(args.output_dir)
|
176 |
+
wfp_concat = None
|
177 |
+
if is_video(args.driving_info):
|
178 |
+
frames_concatenated = concat_frames(I_p_lst, driving_rgb_lst, img_crop_256x256)
|
179 |
+
# save (driving frames, source image, drived frames) result
|
180 |
+
wfp_concat = osp.join(args.output_dir, f'{basename(args.source_image)}--{basename(args.driving_info)}_concat.mp4')
|
181 |
+
images2video(frames_concatenated, wfp=wfp_concat)
|
182 |
+
|
183 |
+
# save drived result
|
184 |
+
wfp = osp.join(args.output_dir, f'{basename(args.source_image)}--{basename(args.driving_info)}.mp4')
|
185 |
+
if inference_cfg.flag_pasteback:
|
186 |
+
images2video(I_p_paste_lst, wfp=wfp)
|
187 |
+
else:
|
188 |
+
images2video(I_p_lst, wfp=wfp)
|
189 |
+
|
190 |
+
return wfp, wfp_concat
|
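The per-frame keypoint update in the loop above drives the source by the change of each driving frame relative to the first one. Below is a minimal, standalone torch sketch of that relative-motion step; the variable names mirror execute(), but the shapes (batch 1, 21 keypoints) and values are illustrative only and are not part of the repository.

import torch

num_kp = 21
x_c_s = torch.randn(1, num_kp, 3)                      # canonical source keypoints
R_s = torch.eye(3).unsqueeze(0)                        # source rotation, 1x3x3
R_d_0 = torch.eye(3).unsqueeze(0)                      # rotation of the first driving frame
R_d_i = torch.eye(3).unsqueeze(0)                      # rotation of the current driving frame

exp_s = torch.zeros(1, num_kp, 3)                      # source expression
exp_d_0, exp_d_i = torch.zeros(1, num_kp, 3), 0.01 * torch.randn(1, num_kp, 3)
scale_s, scale_d_0, scale_d_i = torch.ones(1, 1), torch.ones(1, 1), 1.05 * torch.ones(1, 1)
t_s, t_d_0, t_d_i = torch.zeros(1, 3), torch.zeros(1, 3), torch.tensor([[0.01, 0.02, 0.30]])

R_new = (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s         # compose the relative rotation
delta_new = exp_s + (exp_d_i - exp_d_0)                # relative expression offset
scale_new = scale_s * (scale_d_i / scale_d_0)          # relative scale
t_new = t_s + (t_d_i - t_d_0)                          # relative translation
t_new[..., 2].fill_(0)                                 # tz is zeroed, exactly as in the loop

x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
print(x_d_i_new.shape)                                 # torch.Size([1, 21, 3])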
src/live_portrait_wrapper.py
ADDED
@@ -0,0 +1,307 @@
# coding: utf-8

"""
Wrapper for LivePortrait core functions
"""

import os.path as osp
import numpy as np
import cv2
import torch
import yaml

from .utils.timer import Timer
from .utils.helper import load_model, concat_feat
from .utils.camera import headpose_pred_to_degree, get_rotation_matrix
from .utils.retargeting_utils import calc_eye_close_ratio, calc_lip_close_ratio
from .config.inference_config import InferenceConfig
from .utils.rprint import rlog as log


class LivePortraitWrapper(object):

    def __init__(self, cfg: InferenceConfig):

        model_config = yaml.load(open(cfg.models_config, 'r'), Loader=yaml.SafeLoader)

        # init F
        self.appearance_feature_extractor = load_model(cfg.checkpoint_F, model_config, cfg.device_id, 'appearance_feature_extractor')
        log(f'Load appearance_feature_extractor done.')
        # init M
        self.motion_extractor = load_model(cfg.checkpoint_M, model_config, cfg.device_id, 'motion_extractor')
        log(f'Load motion_extractor done.')
        # init W
        self.warping_module = load_model(cfg.checkpoint_W, model_config, cfg.device_id, 'warping_module')
        log(f'Load warping_module done.')
        # init G
        self.spade_generator = load_model(cfg.checkpoint_G, model_config, cfg.device_id, 'spade_generator')
        log(f'Load spade_generator done.')
        # init S and R
        if cfg.checkpoint_S is not None and osp.exists(cfg.checkpoint_S):
            self.stitching_retargeting_module = load_model(cfg.checkpoint_S, model_config, cfg.device_id, 'stitching_retargeting_module')
            log(f'Load stitching_retargeting_module done.')
        else:
            self.stitching_retargeting_module = None

        self.cfg = cfg
        self.device_id = cfg.device_id
        self.timer = Timer()

    def update_config(self, user_args):
        for k, v in user_args.items():
            if hasattr(self.cfg, k):
                setattr(self.cfg, k, v)

    def prepare_source(self, img: np.ndarray) -> torch.Tensor:
        """ construct the input as standard
        img: HxWx3, uint8, 256x256
        """
        h, w = img.shape[:2]
        if h != self.cfg.input_shape[0] or w != self.cfg.input_shape[1]:
            x = cv2.resize(img, (self.cfg.input_shape[0], self.cfg.input_shape[1]))
        else:
            x = img.copy()

        if x.ndim == 3:
            x = x[np.newaxis].astype(np.float32) / 255.  # HxWx3 -> 1xHxWx3, normalized to 0~1
        elif x.ndim == 4:
            x = x.astype(np.float32) / 255.  # BxHxWx3, normalized to 0~1
        else:
            raise ValueError(f'img ndim should be 3 or 4: {x.ndim}')
        x = np.clip(x, 0, 1)  # clip to 0~1
        x = torch.from_numpy(x).permute(0, 3, 1, 2)  # 1xHxWx3 -> 1x3xHxW
        x = x.cuda(self.device_id)
        return x

    def prepare_driving_videos(self, imgs) -> torch.Tensor:
        """ construct the input as standard
        imgs: NxBxHxWx3, uint8
        """
        if isinstance(imgs, list):
            _imgs = np.array(imgs)[..., np.newaxis]  # TxHxWx3x1
        elif isinstance(imgs, np.ndarray):
            _imgs = imgs
        else:
            raise ValueError(f'imgs type error: {type(imgs)}')

        y = _imgs.astype(np.float32) / 255.
        y = np.clip(y, 0, 1)  # clip to 0~1
        y = torch.from_numpy(y).permute(0, 4, 3, 1, 2)  # TxHxWx3x1 -> Tx1x3xHxW
        y = y.cuda(self.device_id)

        return y

    def extract_feature_3d(self, x: torch.Tensor) -> torch.Tensor:
        """ get the appearance feature of the image by F
        x: Bx3xHxW, normalized to 0~1
        """
        with torch.no_grad():
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=self.cfg.flag_use_half_precision):
                feature_3d = self.appearance_feature_extractor(x)

        return feature_3d.float()

    def get_kp_info(self, x: torch.Tensor, **kwargs) -> dict:
        """ get the implicit keypoint information
        x: Bx3xHxW, normalized to 0~1
        flag_refine_info: whether to transform the pose into degrees and reshape the tensors
        return: a dict containing the keys: 'pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp'
        """
        with torch.no_grad():
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=self.cfg.flag_use_half_precision):
                kp_info = self.motion_extractor(x)

            if self.cfg.flag_use_half_precision:
                # float the dict
                for k, v in kp_info.items():
                    if isinstance(v, torch.Tensor):
                        kp_info[k] = v.float()

        flag_refine_info: bool = kwargs.get('flag_refine_info', True)
        if flag_refine_info:
            bs = kp_info['kp'].shape[0]
            kp_info['pitch'] = headpose_pred_to_degree(kp_info['pitch'])[:, None]  # Bx1
            kp_info['yaw'] = headpose_pred_to_degree(kp_info['yaw'])[:, None]  # Bx1
            kp_info['roll'] = headpose_pred_to_degree(kp_info['roll'])[:, None]  # Bx1
            kp_info['kp'] = kp_info['kp'].reshape(bs, -1, 3)  # BxNx3
            kp_info['exp'] = kp_info['exp'].reshape(bs, -1, 3)  # BxNx3

        return kp_info

    def get_pose_dct(self, kp_info: dict) -> dict:
        pose_dct = dict(
            pitch=headpose_pred_to_degree(kp_info['pitch']).item(),
            yaw=headpose_pred_to_degree(kp_info['yaw']).item(),
            roll=headpose_pred_to_degree(kp_info['roll']).item(),
        )
        return pose_dct

    def get_fs_and_kp_info(self, source_prepared, driving_first_frame):

        # get the canonical keypoints of the source image by M
        source_kp_info = self.get_kp_info(source_prepared, flag_refine_info=True)
        source_rotation = get_rotation_matrix(source_kp_info['pitch'], source_kp_info['yaw'], source_kp_info['roll'])

        # get the canonical keypoints of the first driving frame by M
        driving_first_frame_kp_info = self.get_kp_info(driving_first_frame, flag_refine_info=True)
        driving_first_frame_rotation = get_rotation_matrix(
            driving_first_frame_kp_info['pitch'],
            driving_first_frame_kp_info['yaw'],
            driving_first_frame_kp_info['roll']
        )

        # get the feature volume by F
        source_feature_3d = self.extract_feature_3d(source_prepared)

        return source_kp_info, source_rotation, source_feature_3d, driving_first_frame_kp_info, driving_first_frame_rotation

    def transform_keypoint(self, kp_info: dict):
        """
        transform the implicit keypoints with the pose, shift, and expression deformation
        kp: BxNx3
        """
        kp = kp_info['kp']  # (bs, k, 3)
        pitch, yaw, roll = kp_info['pitch'], kp_info['yaw'], kp_info['roll']

        t, exp = kp_info['t'], kp_info['exp']
        scale = kp_info['scale']

        pitch = headpose_pred_to_degree(pitch)
        yaw = headpose_pred_to_degree(yaw)
        roll = headpose_pred_to_degree(roll)

        bs = kp.shape[0]
        if kp.ndim == 2:
            num_kp = kp.shape[1] // 3  # Bx(num_kpx3)
        else:
            num_kp = kp.shape[1]  # Bxnum_kpx3

        rot_mat = get_rotation_matrix(pitch, yaw, roll)  # (bs, 3, 3)

        # Eqn.2: s * (R * x_c,s + exp) + t
        kp_transformed = kp.view(bs, num_kp, 3) @ rot_mat + exp.view(bs, num_kp, 3)
        kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
        kp_transformed[:, :, 0:2] += t[:, None, 0:2]  # remove z, only apply tx and ty

        return kp_transformed

    def retarget_eye(self, kp_source: torch.Tensor, eye_close_ratio: torch.Tensor) -> torch.Tensor:
        """
        kp_source: BxNx3
        eye_close_ratio: Bx3
        Return: Bx(3*num_kp+2)
        """
        feat_eye = concat_feat(kp_source, eye_close_ratio)

        with torch.no_grad():
            delta = self.stitching_retargeting_module['eye'](feat_eye)

        return delta

    def retarget_lip(self, kp_source: torch.Tensor, lip_close_ratio: torch.Tensor) -> torch.Tensor:
        """
        kp_source: BxNx3
        lip_close_ratio: Bx2
        """
        feat_lip = concat_feat(kp_source, lip_close_ratio)

        with torch.no_grad():
            delta = self.stitching_retargeting_module['lip'](feat_lip)

        return delta

    def stitch(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
        """
        kp_source: BxNx3
        kp_driving: BxNx3
        Return: Bx(3*num_kp+2)
        """
        feat_stitching = concat_feat(kp_source, kp_driving)

        with torch.no_grad():
            delta = self.stitching_retargeting_module['stitching'](feat_stitching)

        return delta

    def stitching(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
        """ conduct the stitching
        kp_source: Bxnum_kpx3
        kp_driving: Bxnum_kpx3
        """

        if self.stitching_retargeting_module is not None:

            bs, num_kp = kp_source.shape[:2]

            kp_driving_new = kp_driving.clone()
            delta = self.stitch(kp_source, kp_driving_new)

            delta_exp = delta[..., :3*num_kp].reshape(bs, num_kp, 3)  # 1x20x3
            delta_tx_ty = delta[..., 3*num_kp:3*num_kp+2].reshape(bs, 1, 2)  # 1x1x2

            kp_driving_new += delta_exp
            kp_driving_new[..., :2] += delta_tx_ty

            return kp_driving_new

        return kp_driving

    def warp_decode(self, feature_3d: torch.Tensor, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
        """ get the image after warping the implicit keypoints
        feature_3d: Bx32x16x64x64, feature volume
        kp_source: BxNx3
        kp_driving: BxNx3
        """
        # Line 18 in Algorithm 1: D(W(f_s; x_s, x′_d,i))
        with torch.no_grad():
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=self.cfg.flag_use_half_precision):
                # get decoder input
                ret_dct = self.warping_module(feature_3d, kp_source=kp_source, kp_driving=kp_driving)
                # decode
                ret_dct['out'] = self.spade_generator(feature=ret_dct['out'])

            # float the dict
            if self.cfg.flag_use_half_precision:
                for k, v in ret_dct.items():
                    if isinstance(v, torch.Tensor):
                        ret_dct[k] = v.float()

        return ret_dct

    def parse_output(self, out: torch.Tensor) -> np.ndarray:
        """ construct the output as standard
        return: 1xHxWx3, uint8
        """
        out = np.transpose(out.data.cpu().numpy(), [0, 2, 3, 1])  # 1x3xHxW -> 1xHxWx3
        out = np.clip(out, 0, 1)  # clip to 0~1
        out = np.clip(out * 255, 0, 255).astype(np.uint8)  # 0~1 -> 0~255

        return out

    def calc_retargeting_ratio(self, source_lmk, driving_lmk_lst):
        input_eye_ratio_lst = []
        input_lip_ratio_lst = []
        for lmk in driving_lmk_lst:
            # for eyes retargeting
            input_eye_ratio_lst.append(calc_eye_close_ratio(lmk[None]))
            # for lip retargeting
            input_lip_ratio_lst.append(calc_lip_close_ratio(lmk[None]))
        return input_eye_ratio_lst, input_lip_ratio_lst

    def calc_combined_eye_ratio(self, input_eye_ratio, source_lmk):
        eye_close_ratio = calc_eye_close_ratio(source_lmk[None])
        eye_close_ratio_tensor = torch.from_numpy(eye_close_ratio).float().cuda(self.device_id)
        input_eye_ratio_tensor = torch.Tensor([input_eye_ratio[0][0]]).reshape(1, 1).cuda(self.device_id)
        # [c_s,eyes, c_d,eyes,i]
        combined_eye_ratio_tensor = torch.cat([eye_close_ratio_tensor, input_eye_ratio_tensor], dim=1)
        return combined_eye_ratio_tensor

    def calc_combined_lip_ratio(self, input_lip_ratio, source_lmk):
        lip_close_ratio = calc_lip_close_ratio(source_lmk[None])
        lip_close_ratio_tensor = torch.from_numpy(lip_close_ratio).float().cuda(self.device_id)
        # [c_s,lip, c_d,lip,i]
        input_lip_ratio_tensor = torch.Tensor([input_lip_ratio[0]]).cuda(self.device_id)
        if input_lip_ratio_tensor.shape != [1, 1]:
            input_lip_ratio_tensor = input_lip_ratio_tensor.reshape(1, 1)
        combined_lip_ratio_tensor = torch.cat([lip_close_ratio_tensor, input_lip_ratio_tensor], dim=1)
        return combined_lip_ratio_tensor
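As a usage note, the wrapper's methods are meant to be chained as F/M first, then W/G. The sketch below shows that flow for a single source image under two assumptions: that InferenceConfig() is constructible with its default checkpoint paths and that the bundled example image exists; everything else follows the method signatures above.

import cv2

from src.config.inference_config import InferenceConfig
from src.live_portrait_wrapper import LivePortraitWrapper

wrapper = LivePortraitWrapper(cfg=InferenceConfig())        # assumed default config

img_bgr = cv2.imread('assets/examples/source/s0.jpg')       # bundled example asset
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
I_s = wrapper.prepare_source(cv2.resize(img_rgb, (256, 256)))   # 1x3x256x256, on GPU

f_s = wrapper.extract_feature_3d(I_s)       # appearance feature volume (F), Bx32x16x64x64
x_s_info = wrapper.get_kp_info(I_s)         # pitch/yaw/roll/t/exp/scale/kp (M)
x_s = wrapper.transform_keypoint(x_s_info)  # Eqn. 2: s * (R @ x_c,s + exp) + t

# A driving frame would yield x_d via the same two calls; reusing x_s here
# simply reconstructs the source through W and G.
out = wrapper.warp_decode(f_s, x_s, x_s)
I_p = wrapper.parse_output(out['out'])[0]   # HxWx3, uint8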
src/modules/__init__.py
ADDED
File without changes
src/modules/appearance_feature_extractor.py
ADDED
@@ -0,0 +1,48 @@
# coding: utf-8

"""
Appearance extractor (F) defined in the paper, which maps the source image s to a 3D appearance feature volume.
"""

import torch
from torch import nn
from .util import SameBlock2d, DownBlock2d, ResBlock3d


class AppearanceFeatureExtractor(nn.Module):

    def __init__(self, image_channel, block_expansion, num_down_blocks, max_features, reshape_channel, reshape_depth, num_resblocks):
        super(AppearanceFeatureExtractor, self).__init__()
        self.image_channel = image_channel
        self.block_expansion = block_expansion
        self.num_down_blocks = num_down_blocks
        self.max_features = max_features
        self.reshape_channel = reshape_channel
        self.reshape_depth = reshape_depth

        self.first = SameBlock2d(image_channel, block_expansion, kernel_size=(3, 3), padding=(1, 1))

        down_blocks = []
        for i in range(num_down_blocks):
            in_features = min(max_features, block_expansion * (2 ** i))
            out_features = min(max_features, block_expansion * (2 ** (i + 1)))
            down_blocks.append(DownBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1)))
        self.down_blocks = nn.ModuleList(down_blocks)

        self.second = nn.Conv2d(in_channels=out_features, out_channels=max_features, kernel_size=1, stride=1)

        self.resblocks_3d = torch.nn.Sequential()
        for i in range(num_resblocks):
            self.resblocks_3d.add_module('3dr' + str(i), ResBlock3d(reshape_channel, kernel_size=3, padding=1))

    def forward(self, source_image):
        out = self.first(source_image)  # Bx3x256x256 -> Bx64x256x256

        for i in range(len(self.down_blocks)):
            out = self.down_blocks[i](out)
        out = self.second(out)
        bs, c, h, w = out.shape  # -> Bx512x64x64

        f_s = out.view(bs, self.reshape_channel, self.reshape_depth, h, w)  # -> Bx32x16x64x64
        f_s = self.resblocks_3d(f_s)  # -> Bx32x16x64x64
        return f_s
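For a quick shape check of the extractor above, the hyperparameters below are chosen to reproduce the shapes in the forward() comments (Bx3x256x256 -> Bx32x16x64x64); the values actually used by the pipeline are defined in src/config/models.yaml, so treat these as illustrative.

import torch

from src.modules.appearance_feature_extractor import AppearanceFeatureExtractor

net = AppearanceFeatureExtractor(
    image_channel=3, block_expansion=64, num_down_blocks=2, max_features=512,
    reshape_channel=32, reshape_depth=16, num_resblocks=6,
)
f_s = net(torch.randn(1, 3, 256, 256))   # two down blocks: 256 -> 128 -> 64
print(f_s.shape)                         # torch.Size([1, 32, 16, 64, 64])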
src/modules/convnextv2.py
ADDED
@@ -0,0 +1,149 @@
# coding: utf-8

"""
This module is adapted from ConvNeXtV2 for the extraction of implicit keypoints, poses, and expression deformation.
"""

import torch
import torch.nn as nn
# from timm.models.layers import trunc_normal_, DropPath
from .util import LayerNorm, DropPath, trunc_normal_, GRN

__all__ = ['convnextv2_tiny']


class Block(nn.Module):
    """ ConvNeXtV2 Block.

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
    """

    def __init__(self, dim, drop_path=0.):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
        self.norm = LayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, 4 * dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.grn = GRN(4 * dim)
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        input = x
        x = self.dwconv(x)
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.grn(x)
        x = self.pwconv2(x)
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)

        x = input + self.drop_path(x)
        return x


class ConvNeXtV2(nn.Module):
    """ ConvNeXt V2

    Args:
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
        dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
    """

    def __init__(
        self,
        in_chans=3,
        depths=[3, 3, 9, 3],
        dims=[96, 192, 384, 768],
        drop_path_rate=0.,
        **kwargs
    ):
        super().__init__()
        self.depths = depths
        self.downsample_layers = nn.ModuleList()  # stem and 3 intermediate downsampling conv layers
        stem = nn.Sequential(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
        )
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
                nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)

        self.stages = nn.ModuleList()  # 4 feature resolution stages, each consisting of multiple residual blocks
        dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
        cur = 0
        for i in range(4):
            stage = nn.Sequential(
                *[Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]
            )
            self.stages.append(stage)
            cur += depths[i]

        self.norm = nn.LayerNorm(dims[-1], eps=1e-6)  # final norm layer

        # NOTE: the output semantic items
        num_bins = kwargs.get('num_bins', 66)
        num_kp = kwargs.get('num_kp', 24)  # the number of implicit keypoints
        self.fc_kp = nn.Linear(dims[-1], 3 * num_kp)  # implicit keypoints

        # print('dims[-1]: ', dims[-1])
        self.fc_scale = nn.Linear(dims[-1], 1)  # scale
        self.fc_pitch = nn.Linear(dims[-1], num_bins)  # pitch bins
        self.fc_yaw = nn.Linear(dims[-1], num_bins)  # yaw bins
        self.fc_roll = nn.Linear(dims[-1], num_bins)  # roll bins
        self.fc_t = nn.Linear(dims[-1], 3)  # translation
        self.fc_exp = nn.Linear(dims[-1], 3 * num_kp)  # expression / delta

    def _init_weights(self, m):
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            trunc_normal_(m.weight, std=.02)
            nn.init.constant_(m.bias, 0)

    def forward_features(self, x):
        for i in range(4):
            x = self.downsample_layers[i](x)
            x = self.stages[i](x)
        return self.norm(x.mean([-2, -1]))  # global average pooling, (N, C, H, W) -> (N, C)

    def forward(self, x):
        x = self.forward_features(x)

        # implicit keypoints
        kp = self.fc_kp(x)

        # pose and expression deformation
        pitch = self.fc_pitch(x)
        yaw = self.fc_yaw(x)
        roll = self.fc_roll(x)
        t = self.fc_t(x)
        exp = self.fc_exp(x)
        scale = self.fc_scale(x)

        ret_dct = {
            'pitch': pitch,
            'yaw': yaw,
            'roll': roll,
            't': t,
            'exp': exp,
            'scale': scale,

            'kp': kp,  # canonical keypoint
        }

        return ret_dct


def convnextv2_tiny(**kwargs):
    model = ConvNeXtV2(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
    return model
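The head above is what the motion extractor (M) uses to turn a 256x256 crop into the raw motion dictionary that get_kp_info() later refines. A small shape check follows, with num_bins and num_kp given illustrative values; the pipeline's real values come from src/config/models.yaml.

import torch

from src.modules.convnextv2 import convnextv2_tiny

net = convnextv2_tiny(num_bins=66, num_kp=21)
out = net(torch.randn(1, 3, 256, 256))
for k, v in out.items():
    print(k, tuple(v.shape))
# pitch/yaw/roll: (1, 66) head-pose bins; t: (1, 3); exp and kp: (1, 63); scale: (1, 1)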
src/modules/dense_motion.py
ADDED
@@ -0,0 +1,104 @@
# coding: utf-8

"""
The module that predicts a dense motion field from the sparse motion representation given by kp_source and kp_driving
"""

from torch import nn
import torch.nn.functional as F
import torch
from .util import Hourglass, make_coordinate_grid, kp2gaussian


class DenseMotionNetwork(nn.Module):
    def __init__(self, block_expansion, num_blocks, max_features, num_kp, feature_channel, reshape_depth, compress, estimate_occlusion_map=True):
        super(DenseMotionNetwork, self).__init__()
        self.hourglass = Hourglass(block_expansion=block_expansion, in_features=(num_kp+1)*(compress+1), max_features=max_features, num_blocks=num_blocks)  # ~60+G

        self.mask = nn.Conv3d(self.hourglass.out_filters, num_kp + 1, kernel_size=7, padding=3)  # 65G! NOTE: computation cost is large
        self.compress = nn.Conv3d(feature_channel, compress, kernel_size=1)  # 0.8G
        self.norm = nn.BatchNorm3d(compress, affine=True)
        self.num_kp = num_kp
        self.flag_estimate_occlusion_map = estimate_occlusion_map

        if self.flag_estimate_occlusion_map:
            self.occlusion = nn.Conv2d(self.hourglass.out_filters*reshape_depth, 1, kernel_size=7, padding=3)
        else:
            self.occlusion = None

    def create_sparse_motions(self, feature, kp_driving, kp_source):
        bs, _, d, h, w = feature.shape  # (bs, 4, 16, 64, 64)
        identity_grid = make_coordinate_grid((d, h, w), ref=kp_source)  # (16, 64, 64, 3)
        identity_grid = identity_grid.view(1, 1, d, h, w, 3)  # (1, 1, d=16, h=64, w=64, 3)
        coordinate_grid = identity_grid - kp_driving.view(bs, self.num_kp, 1, 1, 1, 3)

        k = coordinate_grid.shape[1]

        # NOTE: a first-order flow is missing here
        driving_to_source = coordinate_grid + kp_source.view(bs, self.num_kp, 1, 1, 1, 3)  # (bs, num_kp, d, h, w, 3)

        # adding background feature
        identity_grid = identity_grid.repeat(bs, 1, 1, 1, 1, 1)
        sparse_motions = torch.cat([identity_grid, driving_to_source], dim=1)  # (bs, 1+num_kp, d, h, w, 3)
        return sparse_motions

    def create_deformed_feature(self, feature, sparse_motions):
        bs, _, d, h, w = feature.shape
        feature_repeat = feature.unsqueeze(1).unsqueeze(1).repeat(1, self.num_kp+1, 1, 1, 1, 1, 1)  # (bs, num_kp+1, 1, c, d, h, w)
        feature_repeat = feature_repeat.view(bs * (self.num_kp+1), -1, d, h, w)  # (bs*(num_kp+1), c, d, h, w)
        sparse_motions = sparse_motions.view((bs * (self.num_kp+1), d, h, w, -1))  # (bs*(num_kp+1), d, h, w, 3)
        sparse_deformed = F.grid_sample(feature_repeat, sparse_motions, align_corners=False)
        sparse_deformed = sparse_deformed.view((bs, self.num_kp+1, -1, d, h, w))  # (bs, num_kp+1, c, d, h, w)

        return sparse_deformed

    def create_heatmap_representations(self, feature, kp_driving, kp_source):
        spatial_size = feature.shape[3:]  # (d=16, h=64, w=64)
        gaussian_driving = kp2gaussian(kp_driving, spatial_size=spatial_size, kp_variance=0.01)  # (bs, num_kp, d, h, w)
        gaussian_source = kp2gaussian(kp_source, spatial_size=spatial_size, kp_variance=0.01)  # (bs, num_kp, d, h, w)
        heatmap = gaussian_driving - gaussian_source  # (bs, num_kp, d, h, w)

        # adding background feature
        zeros = torch.zeros(heatmap.shape[0], 1, spatial_size[0], spatial_size[1], spatial_size[2]).type(heatmap.type()).to(heatmap.device)
        heatmap = torch.cat([zeros, heatmap], dim=1)
        heatmap = heatmap.unsqueeze(2)  # (bs, 1+num_kp, 1, d, h, w)
        return heatmap

    def forward(self, feature, kp_driving, kp_source):
        bs, _, d, h, w = feature.shape  # (bs, 32, 16, 64, 64)

        feature = self.compress(feature)  # (bs, 4, 16, 64, 64)
        feature = self.norm(feature)  # (bs, 4, 16, 64, 64)
        feature = F.relu(feature)  # (bs, 4, 16, 64, 64)

        out_dict = dict()

        # 1. deform 3d feature
        sparse_motion = self.create_sparse_motions(feature, kp_driving, kp_source)  # (bs, 1+num_kp, d, h, w, 3)
        deformed_feature = self.create_deformed_feature(feature, sparse_motion)  # (bs, 1+num_kp, c=4, d=16, h=64, w=64)

        # 2. (bs, 1+num_kp, d, h, w)
        heatmap = self.create_heatmap_representations(deformed_feature, kp_driving, kp_source)  # (bs, 1+num_kp, 1, d, h, w)

        input = torch.cat([heatmap, deformed_feature], dim=2)  # (bs, 1+num_kp, c=5, d=16, h=64, w=64)
        input = input.view(bs, -1, d, h, w)  # (bs, (1+num_kp)*c=105, d=16, h=64, w=64)

        prediction = self.hourglass(input)

        mask = self.mask(prediction)
        mask = F.softmax(mask, dim=1)  # (bs, 1+num_kp, d=16, h=64, w=64)
        out_dict['mask'] = mask
        mask = mask.unsqueeze(2)  # (bs, num_kp+1, 1, d, h, w)
        sparse_motion = sparse_motion.permute(0, 1, 5, 2, 3, 4)  # (bs, num_kp+1, 3, d, h, w)
        deformation = (sparse_motion * mask).sum(dim=1)  # (bs, 3, d, h, w); the mask takes effect here
        deformation = deformation.permute(0, 2, 3, 4, 1)  # (bs, d, h, w, 3)

        out_dict['deformation'] = deformation

        if self.flag_estimate_occlusion_map:
            bs, _, d, h, w = prediction.shape
            prediction_reshape = prediction.view(bs, -1, h, w)
            occlusion_map = torch.sigmoid(self.occlusion(prediction_reshape))  # Bx1x64x64
            out_dict['occlusion_map'] = occlusion_map

        return out_dict
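DenseMotionNetwork is normally constructed inside the warping module rather than standalone, so the instantiation below is only a shape check: the hyperparameter values are assumptions picked to match the shape comments in forward() (a 32-channel feature volume compressed to 4 channels); the authoritative values live in src/config/models.yaml.

import torch

from src.modules.dense_motion import DenseMotionNetwork

num_kp = 21
net = DenseMotionNetwork(
    block_expansion=32, num_blocks=5, max_features=1024, num_kp=num_kp,
    feature_channel=32, reshape_depth=16, compress=4, estimate_occlusion_map=True,
)
feature = torch.randn(1, 32, 16, 64, 64)       # f_s from the appearance extractor
kp_source = torch.randn(1, num_kp, 3)
kp_driving = torch.randn(1, num_kp, 3)

out = net(feature, kp_source, kp_driving)
print(out['deformation'].shape)                # torch.Size([1, 16, 64, 64, 3])
print(out['occlusion_map'].shape)              # torch.Size([1, 1, 64, 64])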