ameerazam08 commited on
Commit
a5c5b03
1 Parent(s): fbeb913

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +8 -0
  2. .gitignore +199 -0
  3. README-zh.md +137 -0
  4. README.md +137 -0
  5. checkpoints/.gitkeep +0 -0
  6. data_gen/eg3d/convert_to_eg3d_convention.py +146 -0
  7. data_gen/runs/binarizer_nerf.py +335 -0
  8. data_gen/runs/nerf/process_guide.md +49 -0
  9. data_gen/runs/nerf/run.sh +51 -0
  10. data_gen/utils/mp_feature_extractors/face_landmarker.py +130 -0
  11. data_gen/utils/mp_feature_extractors/face_landmarker.task +3 -0
  12. data_gen/utils/mp_feature_extractors/mp_segmenter.py +274 -0
  13. data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite +3 -0
  14. data_gen/utils/path_converter.py +24 -0
  15. data_gen/utils/process_audio/extract_hubert.py +95 -0
  16. data_gen/utils/process_audio/extract_mel_f0.py +148 -0
  17. data_gen/utils/process_audio/resample_audio_to_16k.py +49 -0
  18. data_gen/utils/process_image/extract_lm2d.py +197 -0
  19. data_gen/utils/process_image/extract_segment_imgs.py +114 -0
  20. data_gen/utils/process_image/fit_3dmm_landmark.py +369 -0
  21. data_gen/utils/process_video/euler2quaterion.py +35 -0
  22. data_gen/utils/process_video/extract_blink.py +50 -0
  23. data_gen/utils/process_video/extract_lm2d.py +164 -0
  24. data_gen/utils/process_video/extract_segment_imgs.py +500 -0
  25. data_gen/utils/process_video/fit_3dmm_landmark.py +565 -0
  26. data_gen/utils/process_video/inpaint_torso_imgs.py +193 -0
  27. data_gen/utils/process_video/resample_video_to_25fps_resize_to_512.py +87 -0
  28. data_gen/utils/process_video/split_video_to_imgs.py +53 -0
  29. data_util/face3d_helper.py +309 -0
  30. deep_3drecon/BFM/.gitkeep +0 -0
  31. deep_3drecon/bfm_left_eye_faces.npy +3 -0
  32. deep_3drecon/bfm_right_eye_faces.npy +3 -0
  33. deep_3drecon/deep_3drecon_models/bfm.py +426 -0
  34. deep_3drecon/ncc_code.npy +3 -0
  35. deep_3drecon/secc_renderer.py +78 -0
  36. deep_3drecon/util/mesh_renderer.py +131 -0
  37. docs/prepare_env/install_guide-zh.md +35 -0
  38. docs/prepare_env/install_guide.md +34 -0
  39. docs/prepare_env/requirements.txt +75 -0
  40. inference/app_real3dportrait.py +244 -0
  41. inference/edit_secc.py +147 -0
  42. inference/infer_utils.py +154 -0
  43. inference/real3d_infer.py +542 -0
  44. insta.sh +18 -0
  45. modules/audio2motion/cnn_models.py +359 -0
  46. modules/audio2motion/flow_base.py +838 -0
  47. modules/audio2motion/multi_length_disc.py +340 -0
  48. modules/audio2motion/transformer_base.py +988 -0
  49. modules/audio2motion/transformer_models.py +208 -0
  50. modules/audio2motion/utils.py +29 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data_gen/utils/mp_feature_extractors/face_landmarker.task filter=lfs diff=lfs merge=lfs -text
37
+ pytorch3d/.github/bundle_adjust.gif filter=lfs diff=lfs merge=lfs -text
38
+ pytorch3d/.github/camera_position_teapot.gif filter=lfs diff=lfs merge=lfs -text
39
+ pytorch3d/.github/fit_nerf.gif filter=lfs diff=lfs merge=lfs -text
40
+ pytorch3d/.github/fit_textured_volume.gif filter=lfs diff=lfs merge=lfs -text
41
+ pytorch3d/.github/implicitron_config.gif filter=lfs diff=lfs merge=lfs -text
42
+ pytorch3d/.github/nerf_project_logo.gif filter=lfs diff=lfs merge=lfs -text
43
+ pytorch3d/docs/notes/assets/batch_modes.gif filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,199 @@
1
+ # big files
2
+ data_util/face_tracking/3DMM/01_MorphableModel.mat
3
+ data_util/face_tracking/3DMM/3DMM_info.npy
4
+
5
+ !/deep_3drecon/BFM/.gitkeep
6
+ deep_3drecon/BFM/Exp_Pca.bin
7
+ deep_3drecon/BFM/01_MorphableModel.mat
8
+ deep_3drecon/BFM/BFM_model_front.mat
9
+ deep_3drecon/network/FaceReconModel.pb
10
+ deep_3drecon/checkpoints/*
11
+
12
+ .vscode
13
+ ### Project ignore
14
+ /checkpoints/*
15
+ !/checkpoints/.gitkeep
16
+ /data/*
17
+ !/data/.gitkeep
18
+ infer_out
19
+ rsync
20
+ .idea
21
+ .DS_Store
22
+ bak
23
+ tmp
24
+ *.tar.gz
25
+ mos
26
+ nbs
27
+ /configs_usr/*
28
+ !/configs_usr/.gitkeep
29
+ /egs_usr/*
30
+ !/egs_usr/.gitkeep
31
+ /rnnoise
32
+ #/usr/*
33
+ #!/usr/.gitkeep
34
+ scripts_usr
35
+
36
+ # Created by .ignore support plugin (hsz.mobi)
37
+ ### Python template
38
+ # Byte-compiled / optimized / DLL files
39
+ __pycache__/
40
+ *.py[cod]
41
+ *$py.class
42
+
43
+ # C extensions
44
+ *.so
45
+
46
+ # Distribution / packaging
47
+ .Python
48
+ build/
49
+ develop-eggs/
50
+ dist/
51
+ downloads/
52
+ eggs/
53
+ .eggs/
54
+ lib/
55
+ lib64/
56
+ parts/
57
+ sdist/
58
+ var/
59
+ wheels/
60
+ pip-wheel-metadata/
61
+ share/python-wheels/
62
+ *.egg-info/
63
+ .installed.cfg
64
+ *.egg
65
+ MANIFEST
66
+
67
+ # PyInstaller
68
+ # Usually these files are written by a python script from a template
69
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
70
+ *.manifest
71
+ *.spec
72
+
73
+ # Installer logs
74
+ pip-log.txt
75
+ pip-delete-this-directory.txt
76
+
77
+ # Unit test / coverage reports
78
+ htmlcov/
79
+ .tox/
80
+ .nox/
81
+ .coverage
82
+ .coverage.*
83
+ .cache
84
+ nosetests.xml
85
+ coverage.xml
86
+ *.cover
87
+ .hypothesis/
88
+ .pytest_cache/
89
+
90
+ # Translations
91
+ *.mo
92
+ *.pot
93
+
94
+ # Django stuff:
95
+ *.log
96
+ local_settings.py
97
+ db.sqlite3
98
+ db.sqlite3-journal
99
+
100
+ # Flask stuff:
101
+ instance/
102
+ .webassets-cache
103
+
104
+ # Scrapy stuff:
105
+ .scrapy
106
+
107
+ # Sphinx documentation
108
+ docs/_build/
109
+
110
+ # PyBuilder
111
+ target/
112
+
113
+ # Jupyter Notebook
114
+ .ipynb_checkpoints
115
+
116
+ # IPython
117
+ profile_default/
118
+ ipython_config.py
119
+
120
+ # pyenv
121
+ .python-version
122
+
123
+ # pipenv
124
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
125
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
126
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
127
+ # install all needed dependencies.
128
+ #Pipfile.lock
129
+
130
+ # celery beat schedule file
131
+ celerybeat-schedule
132
+
133
+ # SageMath parsed files
134
+ *.sage.py
135
+
136
+ # Environments
137
+ .env
138
+ .venv
139
+ env/
140
+ venv/
141
+ ENV/
142
+ env.bak/
143
+ venv.bak/
144
+
145
+ # Spyder project settings
146
+ .spyderproject
147
+ .spyproject
148
+
149
+ # Rope project settings
150
+ .ropeproject
151
+
152
+ # mkdocs documentation
153
+ /site
154
+
155
+ # mypy
156
+ .mypy_cache/
157
+ .dmypy.json
158
+ dmypy.json
159
+
160
+ # Pyre type checker
161
+ .pyre/
162
+ data_util/deepspeech_features/deepspeech-0.9.2-models.pbmm
163
+ deep_3drecon/mesh_renderer/bazel-bin
164
+ deep_3drecon/mesh_renderer/bazel-mesh_renderer
165
+ deep_3drecon/mesh_renderer/bazel-out
166
+ deep_3drecon/mesh_renderer/bazel-testlogs
167
+
168
+ .nfs*
169
+ infer_outs/*
170
+
171
+ *.pth
172
+ venv_113/*
173
+ *.pt
174
+ experiments/trials
175
+ flame_3drecon/*
176
+
177
+ temp/
178
+ /kill.sh
179
+ /datasets
180
+ data_util/imagenet_classes.txt
181
+ process_data_May.sh
182
+ /env_prepare_reproduce.md
183
+ /my_debug.py
184
+
185
+ utils/metrics/shape_predictor_68_face_landmarks.dat
186
+ *.mp4
187
+ _torchshow/
188
+ *.png
189
+ *.jpg
190
+
191
+ *.mrc
192
+
193
+ deep_3drecon/BFM/BFM_exp_idx.mat
194
+ deep_3drecon/BFM/BFM_front_idx.mat
195
+ deep_3drecon/BFM/facemodel_info.mat
196
+ deep_3drecon/BFM/index_mp468_from_mesh35709.npy
197
+ deep_3drecon/BFM/mediapipe_in_bfm53201.npy
198
+ deep_3drecon/BFM/std_exp.txt
199
+ !data/raw/examples/*
README-zh.md ADDED
@@ -0,0 +1,137 @@
1
+ # Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis | ICLR 2024 Spotlight
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-%3CCOLOR%3E.svg)](https://arxiv.org/abs/2401.08503)| [![GitHub Stars](https://img.shields.io/github/stars/yerfor/Real3DPortrait
3
+ )](https://github.com/yerfor/Real3DPortrait) | [English Readme](./README.md)
4
+
5
+ 这个仓库是Real3D-Portrait的官方PyTorch实现, 用于实现单参考图(one-shot)、高视频真实度(video reality)的虚拟人视频合成。您可以访问我们的[项目页面](https://real3dportrait.github.io/)以观看Demo视频, 阅读我们的[论文](https://arxiv.org/pdf/2401.08503.pdf)以了解技术细节。
6
+
7
+ <p align="center">
8
+ <br>
9
+ <img src="assets/real3dportrait.png" width="100%"/>
10
+ <br>
11
+ </p>
12
+
13
+ # 快速上手!
14
+ ## 安装环境
15
+ 请参照[环境配置文档](docs/prepare_env/install_guide-zh.md),配置Conda环境`real3dportrait`
16
+ ## 下载预训练与第三方模型
17
+ ### 3DMM BFM模型
18
+ 下载3DMM BFM模型:[Google Drive](https://drive.google.com/drive/folders/1o4t5YIw7w4cMUN4bgU9nPf6IyWVG1bEk?usp=sharing) 或 [BaiduYun Disk](https://pan.baidu.com/s/1aqv1z_qZ23Vp2VP4uxxblQ?pwd=m9q5 ) 提取码: m9q5
19
+
20
+
21
+ 下载完成后,放置全部的文件到`deep_3drecon/BFM`里,文件结构如下:
22
+ ```
23
+ deep_3drecon/BFM/
24
+ ├── 01_MorphableModel.mat
25
+ ├── BFM_exp_idx.mat
26
+ ├── BFM_front_idx.mat
27
+ ├── BFM_model_front.mat
28
+ ├── Exp_Pca.bin
29
+ ├── facemodel_info.mat
30
+ ├── index_mp468_from_mesh35709.npy
31
+ ├── mediapipe_in_bfm53201.npy
32
+ └── std_exp.txt
33
+ ```
34
+
35
+ ### 预训练模型
36
+ 下载预训练的Real3D-Portrait:[Google Drive](https://drive.google.com/drive/folders/1MAveJf7RvJ-Opg1f5qhLdoRoC_Gc6nD9?usp=sharing) 或 [BaiduYun Disk](https://pan.baidu.com/s/1Mjmbn0UtA1Zm9owZ7zWNgQ?pwd=6x4f ) 提取码: 6x4f
37
+
38
+ 下载完成后,放置全部的文件到`checkpoints`里并解压,文件结构如下:
39
+ ```
40
+ checkpoints/
41
+ ├── 240126_real3dportrait_orig
42
+ │ ├── audio2secc_vae
43
+ │ │ ├── config.yaml
44
+ │ │ └── model_ckpt_steps_400000.ckpt
45
+ │ └── secc2plane_torso_orig
46
+ │ ├── config.yaml
47
+ │ └── model_ckpt_steps_100000.ckpt
48
+ └── pretrained_ckpts
49
+ └── mit_b0.pth
50
+ ```
51
+
52
+ ## 推理测试
53
+ 我们目前提供了**命令行(CLI)**与**Gradio WebUI**推理方式,并将在未来提供Google Colab方式。我们同时支持音频驱动(Audio-Driven)与视频驱动(Video-Driven):
54
+
55
+ - 音频驱动场景下,需要至少提供`source image`与`driving audio`
56
+ - 视频驱动场景下,需要至少提供`source image`与`driving expression video`
57
+
58
+ ### Gradio WebUI推理
59
+ 启动Gradio WebUI,按照提示上传素材,点击`Generate`按钮即可推理:
60
+ ```bash
61
+ python inference/app_real3dportrait.py
62
+ ```
63
+
64
+ ### 命令行推理
65
+ 首先,切换至项目根目录并启用Conda环境:
66
+ ```bash
67
+ cd <Real3DPortraitRoot>
68
+ conda activate real3dportrait
69
+ export PYTHON_PATH=./
70
+ ```
71
+ 音频驱动场景下,需要至少提供source image与driving audio,推理指令:
72
+ ```bash
73
+ python inference/real3d_infer.py \
74
+ --src_img <PATH_TO_SOURCE_IMAGE> \
75
+ --drv_aud <PATH_TO_AUDIO> \
76
+ --drv_pose <PATH_TO_POSE_VIDEO, OPTIONAL> \
77
+ --bg_img <PATH_TO_BACKGROUND_IMAGE, OPTIONAL> \
78
+ --out_name <PATH_TO_OUTPUT_VIDEO, OPTIONAL>
79
+ ```
80
+ 视频驱动场景下,需要至少提供source image与driving expression video(作为drv_aud参数),推理指令:
81
+ ```bash
82
+ python inference/real3d_infer.py \
83
+ --src_img <PATH_TO_SOURCE_IMAGE> \
84
+ --drv_aud <PATH_TO_EXP_VIDEO> \
85
+ --drv_pose <PATH_TO_POSE_VIDEO, OPTIONAL> \
86
+ --bg_img <PATH_TO_BACKGROUND_IMAGE, OPTIONAL> \
87
+ --out_name <PATH_TO_OUTPUT_VIDEO, OPTIONAL>
88
+ ```
89
+ 一些可选参数注释:
90
+ - `--drv_pose` 指定时提供了运动pose信息,不指定则为静态运动
91
+ - `--bg_img` 指定时提供了背景信息,不指定则为source image提取的背景
92
+ - `--mouth_amp` 嘴部张幅参数,值越大张幅越大
93
+ - `--map_to_init_pose` 值为`True`时,首帧的pose将被映射到source pose,后续帧也作相同变换
94
+ - `--temperature` 代表audio2motion的采样温度,值越大结果越多样,但同时精确度越低
95
+ - `--out_name` 不指定时,结果将保存在`infer_out/tmp/`中
96
+ - `--out_mode` 值为`final`时,只输出说话人视频;值为`concat_debug`时,同时输出一些可视化的中间结果
97
+
98
+ 指令示例:
99
+ ```bash
100
+ python inference/real3d_infer.py \
101
+ --src_img data/raw/examples/Macron.png \
102
+ --drv_aud data/raw/examples/Obama_5s.wav \
103
+ --drv_pose data/raw/examples/May_5s.mp4 \
104
+ --bg_img data/raw/examples/bg.png \
105
+ --out_name output.mp4 \
106
+ --out_mode concat_debug
107
+ ```
108
+
109
+ ## ToDo
110
+ - [x] **Release Pre-trained weights of Real3D-Portrait.**
111
+ - [x] **Release Inference Code of Real3D-Portrait.**
112
+ - [x] **Release Gradio Demo of Real3D-Portrait.**
113
+ - [ ] **Release Google Colab of Real3D-Portrait.**
114
+ - [ ] **Release Training Code of Real3D-Portrait.**
115
+
116
+ # 引用我们
117
+ 如果这个仓库对你有帮助,请考虑引用我们的工作:
118
+ ```
119
+ @article{ye2024real3d,
120
+ title={Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis},
121
+ author={Ye, Zhenhui and Zhong, Tianyun and Ren, Yi and Yang, Jiaqi and Li, Weichuang and Huang, Jiawei and Jiang, Ziyue and He, Jinzheng and Huang, Rongjie and Liu, Jinglin and others},
122
+ journal={arXiv preprint arXiv:2401.08503},
123
+ year={2024}
124
+ }
125
+ @article{ye2023geneface++,
126
+ title={GeneFace++: Generalized and Stable Real-Time Audio-Driven 3D Talking Face Generation},
127
+ author={Ye, Zhenhui and He, Jinzheng and Jiang, Ziyue and Huang, Rongjie and Huang, Jiawei and Liu, Jinglin and Ren, Yi and Yin, Xiang and Ma, Zejun and Zhao, Zhou},
128
+ journal={arXiv preprint arXiv:2305.00787},
129
+ year={2023}
130
+ }
131
+ @article{ye2023geneface,
132
+ title={GeneFace: Generalized and High-Fidelity Audio-Driven 3D Talking Face Synthesis},
133
+ author={Ye, Zhenhui and Jiang, Ziyue and Ren, Yi and Liu, Jinglin and He, Jinzheng and Zhao, Zhou},
134
+ journal={arXiv preprint arXiv:2301.13430},
135
+ year={2023}
136
+ }
137
+ ```
README.md ADDED
@@ -0,0 +1,137 @@
1
+ # Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis | ICLR 2024 Spotlight
2
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-%3CCOLOR%3E.svg)](https://arxiv.org/abs/2401.08503)| [![GitHub Stars](https://img.shields.io/github/stars/yerfor/Real3DPortrait
3
+ )](https://github.com/yerfor/Real3DPortrait) | [中文文档](./README-zh.md)
4
+
5
+ This is the official PyTorch implementation of Real3D-Portrait, for one-shot, realistic 3D talking portrait synthesis. You can visit our [Demo Page](https://real3dportrait.github.io/) to watch demo videos, and read our [Paper](https://arxiv.org/pdf/2401.08503.pdf) for technical details.
6
+
7
+ <p align="center">
8
+ <br>
9
+ <img src="assets/real3dportrait.png" width="100%"/>
10
+ <br>
11
+ </p>
12
+
13
+ # Quick Start!
14
+ ## Environment Installation
15
+ Please refer to the [Installation Guide](docs/prepare_env/install_guide.md) to prepare a Conda environment `real3dportrait`.
16
+ ## Download Pre-trained & Third-Party Models
17
+ ### 3DMM BFM Model
18
+ Download the 3DMM BFM model from [Google Drive](https://drive.google.com/drive/folders/1o4t5YIw7w4cMUN4bgU9nPf6IyWVG1bEk?usp=sharing) or [BaiduYun Disk](https://pan.baidu.com/s/1aqv1z_qZ23Vp2VP4uxxblQ?pwd=m9q5) (password: m9q5).
19
+
20
+
21
+ Put all the files in `deep_3drecon/BFM`; the file structure should look like this:
22
+ ```
23
+ deep_3drecon/BFM/
24
+ ├── 01_MorphableModel.mat
25
+ ├── BFM_exp_idx.mat
26
+ ├── BFM_front_idx.mat
27
+ ├── BFM_model_front.mat
28
+ ├── Exp_Pca.bin
29
+ ├── facemodel_info.mat
30
+ ├── index_mp468_from_mesh35709.npy
31
+ ├── mediapipe_in_bfm53201.npy
32
+ └── std_exp.txt
33
+ ```
34
+
35
+ ### Pre-trained Real3D-Portrait
36
+ Download the pre-trained Real3D-Portrait from [Google Drive](https://drive.google.com/drive/folders/1MAveJf7RvJ-Opg1f5qhLdoRoC_Gc6nD9?usp=sharing) or [BaiduYun Disk](https://pan.baidu.com/s/1Mjmbn0UtA1Zm9owZ7zWNgQ?pwd=6x4f) (password: 6x4f).
37
+
38
+ Put the zip files in `checkpoints` and unzip them; the file structure should look like this:
39
+ ```
40
+ checkpoints/
41
+ ├── 240126_real3dportrait_orig
42
+ │ ├── audio2secc_vae
43
+ │ │ ├── config.yaml
44
+ │ │ └── model_ckpt_steps_400000.ckpt
45
+ │ └── secc2plane_torso_orig
46
+ │ ├── config.yaml
47
+ │ └── model_ckpt_steps_100000.ckpt
48
+ └── pretrained_ckpts
49
+ └── mit_b0.pth
50
+ ```
51
+
52
+ ## Inference
53
+ Currently, we provide **CLI** and **Gradio WebUI** inference, and a Google Colab will be provided in the future. We support both audio-driven and video-driven synthesis:
54
+
55
+ - For audio-driven, prepare at least a `source image` and a `driving audio`
56
+ - For video-driven, prepare at least a `source image` and a `driving expression video`
57
+
58
+ ### Gradio WebUI
59
+ Run the Gradio WebUI demo, upload your resources in the webpage, and click the `Generate` button to run inference:
60
+ ```bash
61
+ python inference/app_real3dportrait.py
62
+ ```
63
+
64
+ ### CLI Inference
65
+ First, switch to the project folder and activate the Conda environment:
66
+ ```bash
67
+ cd <Real3DPortraitRoot>
68
+ conda activate real3dportrait
69
+ export PYTHON_PATH=./
70
+ ```
71
+ For audio-driven synthesis, provide a source image and driving audio:
72
+ ```bash
73
+ python inference/real3d_infer.py \
74
+ --src_img <PATH_TO_SOURCE_IMAGE> \
75
+ --drv_aud <PATH_TO_AUDIO> \
76
+ --drv_pose <PATH_TO_POSE_VIDEO, OPTIONAL> \
77
+ --bg_img <PATH_TO_BACKGROUND_IMAGE, OPTIONAL> \
78
+ --out_name <PATH_TO_OUTPUT_VIDEO, OPTIONAL>
79
+ ```
80
+ For video-driven synthesis, provide a source image and a driving expression video (passed as the `--drv_aud` parameter):
81
+ ```bash
82
+ python inference/real3d_infer.py \
83
+ --src_img <PATH_TO_SOURCE_IMAGE> \
84
+ --drv_aud <PATH_TO_EXP_VIDEO> \
85
+ --drv_pose <PATH_TO_POSE_VIDEO, OPTIONAL> \
86
+ --bg_img <PATH_TO_BACKGROUND_IMAGE, OPTIONAL> \
87
+ --out_name <PATH_TO_OUTPUT_VIDEO, OPTIONAL>
88
+ ```
89
+ Some optional parameters:
90
+ - `--drv_pose` provides head pose information; defaults to a static pose
91
+ - `--bg_img` provides the background; defaults to the background extracted from the source image
92
+ - `--mouth_amp` mouth amplitude; a higher value leads to a wider mouth opening
93
+ - `--map_to_init_pose` when set to `True`, the initial pose is mapped to the source pose, and subsequent poses are transformed accordingly
94
+ - `--temperature` sampling temperature of audio2motion; higher values give more diverse results at the expense of accuracy
95
+ - `--out_name` when not assigned, results are stored in `infer_out/tmp/`
96
+ - `--out_mode` when `final`, only the final talking-head video is output; when `concat_debug`, visualizations of several intermediate results are also output
97
+
98
+ Commandline example:
99
+ ```bash
100
+ python inference/real3d_infer.py \
101
+ --src_img data/raw/examples/Macron.png \
102
+ --drv_aud data/raw/examples/Obama_5s.wav \
103
+ --drv_pose data/raw/examples/May_5s.mp4 \
104
+ --bg_img data/raw/examples/bg.png \
105
+ --out_name output.mp4 \
106
+ --out_mode concat_debug
107
+ ```
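For batch runs, a minimal Python sketch that wraps the same CLI is shown below; the input paths are placeholders reusing the example assets above.

```python
# Minimal sketch: drive several (image, audio) pairs through the CLI above.
# Paths are placeholders taken from the example; adjust them to your own data.
import subprocess

pairs = [
    ("data/raw/examples/Macron.png", "data/raw/examples/Obama_5s.wav"),
]
for src_img, drv_aud in pairs:
    subprocess.run(
        ["python", "inference/real3d_infer.py",
         "--src_img", src_img,
         "--drv_aud", drv_aud,
         "--out_mode", "final"],
        check=True,
    )
```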
108
+
109
+ # ToDo
110
+ - [x] **Release Pre-trained weights of Real3D-Portrait.**
111
+ - [x] **Release Inference Code of Real3D-Portrait.**
112
+ - [x] **Release Gradio Demo of Real3D-Portrait.**
113
+ - [ ] **Release Google Colab of Real3D-Portrait.**
114
+ - [ ] **Release Training Code of Real3D-Portrait.**
115
+
116
+ # Citation
117
+ If you find this repo helpful to your work, please consider citing us:
118
+ ```
119
+ @article{ye2024real3d,
120
+ title={Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis},
121
+ author={Ye, Zhenhui and Zhong, Tianyun and Ren, Yi and Yang, Jiaqi and Li, Weichuang and Huang, Jiawei and Jiang, Ziyue and He, Jinzheng and Huang, Rongjie and Liu, Jinglin and others},
122
+ journal={arXiv preprint arXiv:2401.08503},
123
+ year={2024}
124
+ }
125
+ @article{ye2023geneface++,
126
+ title={GeneFace++: Generalized and Stable Real-Time Audio-Driven 3D Talking Face Generation},
127
+ author={Ye, Zhenhui and He, Jinzheng and Jiang, Ziyue and Huang, Rongjie and Huang, Jiawei and Liu, Jinglin and Ren, Yi and Yin, Xiang and Ma, Zejun and Zhao, Zhou},
128
+ journal={arXiv preprint arXiv:2305.00787},
129
+ year={2023}
130
+ }
131
+ @article{ye2023geneface,
132
+ title={GeneFace: Generalized and High-Fidelity Audio-Driven 3D Talking Face Synthesis},
133
+ author={Ye, Zhenhui and Jiang, Ziyue and Ren, Yi and Liu, Jinglin and He, Jinzheng and Zhao, Zhou},
134
+ journal={arXiv preprint arXiv:2301.13430},
135
+ year={2023}
136
+ }
137
+ ```
checkpoints/.gitkeep ADDED
File without changes
data_gen/eg3d/convert_to_eg3d_convention.py ADDED
@@ -0,0 +1,146 @@
1
+ import numpy as np
2
+ import torch
3
+ import copy
4
+ from utils.commons.tensor_utils import convert_to_tensor, convert_to_np
5
+ from deep_3drecon.deep_3drecon_models.bfm import ParametricFaceModel
6
+
7
+
8
+ def _fix_intrinsics(intrinsics):
9
+ """
10
+ intrinsics: [3,3], not batch-wise
11
+ """
12
+ # unnormalized normalized
13
+
14
+ # [[ f_x, s=0, x_0] [[ f_x/size_x, s=0, x_0/size_x=0.5]
15
+ # [ 0, f_y, y_0] -> [ 0, f_y/size_y, y_0/size_y=0.5]
16
+ # [ 0, 0, 1 ]] [ 0, 0, 1 ]]
17
+ intrinsics = np.array(intrinsics).copy()
18
+ assert intrinsics.shape == (3, 3), intrinsics
19
+ intrinsics[0,0] = 2985.29/700
20
+ intrinsics[1,1] = 2985.29/700
21
+ intrinsics[0,2] = 1/2
22
+ intrinsics[1,2] = 1/2
23
+ assert intrinsics[0,1] == 0
24
+ assert intrinsics[2,2] == 1
25
+ assert intrinsics[1,0] == 0
26
+ assert intrinsics[2,0] == 0
27
+ assert intrinsics[2,1] == 0
28
+ return intrinsics
29
+
30
+ # Used in original submission
31
+ def _fix_pose_orig(pose):
32
+ """
33
+ pose: [4,4], not batch-wise
34
+ """
35
+ pose = np.array(pose).copy()
36
+ location = pose[:3, 3]
37
+ radius = np.linalg.norm(location)
38
+ pose[:3, 3] = pose[:3, 3]/radius * 2.7
39
+ return pose
40
+
41
+
42
+ def get_eg3d_convention_camera_pose_intrinsic(item):
43
+ """
44
+ item: a dict during binarize
45
+
46
+ """
47
+ if item['euler'].ndim == 1:
48
+ angle = convert_to_tensor(copy.copy(item['euler']))
49
+ trans = copy.deepcopy(item['trans'])
50
+
51
+ # handle the difference of euler axis between eg3d and ours
52
+ # see data_gen/process_ffhq_for_eg3d/transplant_eg3d_ckpt_into_our_convention.ipynb
53
+ # angle += torch.tensor([0, 3.1415926535, 3.1415926535], device=angle.device)
54
+ R = ParametricFaceModel.compute_rotation(angle.unsqueeze(0))[0].cpu().numpy()
55
+ trans[2] += -10
56
+ c = -np.dot(R, trans)
57
+ pose = np.eye(4)
58
+ pose[:3,:3] = R
59
+ c *= 0.27 # normalize camera radius
60
+ c[1] += 0.006 # additional offset used in submission
61
+ c[2] += 0.161 # additional offset used in submission
62
+ pose[0,3] = c[0]
63
+ pose[1,3] = c[1]
64
+ pose[2,3] = c[2]
65
+
66
+ focal = 2985.29 # = 1015*1024/224*(300/466.285),
67
+ # todo: 如果修改了fit 3dmm阶段的camera intrinsic,这里也要跟着改
68
+ pp = 512#112
69
+ w = 1024#224
70
+ h = 1024#224
71
+
72
+ K = np.eye(3)
73
+ K[0][0] = focal
74
+ K[1][1] = focal
75
+ K[0][2] = w/2.0
76
+ K[1][2] = h/2.0
77
+ convention_K = _fix_intrinsics(K)
78
+
79
+ Rot = np.eye(3)
80
+ Rot[0, 0] = 1
81
+ Rot[1, 1] = -1
82
+ Rot[2, 2] = -1
83
+ pose[:3, :3] = np.dot(pose[:3, :3], Rot) # permute axes
84
+ convention_pose = _fix_pose_orig(pose)
85
+
86
+ item['c2w'] = pose
87
+ item['convention_c2w'] = convention_pose
88
+ item['intrinsics'] = convention_K
89
+ return item
90
+ else:
91
+ num_samples = len(item['euler'])
92
+ eulers_all = convert_to_tensor(copy.deepcopy(item['euler'])) # [B, 3]
93
+ trans_all = copy.deepcopy(item['trans']) # [B, 3]
94
+
95
+ # handle the difference of euler axis between eg3d and ours
96
+ # see data_gen/process_ffhq_for_eg3d/transplant_eg3d_ckpt_into_our_convention.ipynb
97
+ # eulers_all += torch.tensor([0, 3.1415926535, 3.1415926535], device=eulers_all.device).unsqueeze(0).repeat([eulers_all.shape[0],1])
98
+
99
+ intrinsics = []
100
+ poses = []
101
+ convention_poses = []
102
+ for i in range(num_samples):
103
+ angle = eulers_all[i]
104
+ trans = trans_all[i]
105
+ R = ParametricFaceModel.compute_rotation(angle.unsqueeze(0))[0].cpu().numpy()
106
+ trans[2] += -10
107
+ c = -np.dot(R, trans)
108
+ pose = np.eye(4)
109
+ pose[:3,:3] = R
110
+ c *= 0.27 # normalize camera radius
111
+ c[1] += 0.006 # additional offset used in submission
112
+ c[2] += 0.161 # additional offset used in submission
113
+ pose[0,3] = c[0]
114
+ pose[1,3] = c[1]
115
+ pose[2,3] = c[2]
116
+
117
+ focal = 2985.29 # = 1015*1024/224*(300/466.285),
118
+ # todo: 如果修改了fit 3dmm阶段的camera intrinsic,这里也要跟着改
119
+ pp = 512#112
120
+ w = 1024#224
121
+ h = 1024#224
122
+
123
+ K = np.eye(3)
124
+ K[0][0] = focal
125
+ K[1][1] = focal
126
+ K[0][2] = w/2.0
127
+ K[1][2] = h/2.0
128
+ convention_K = _fix_intrinsics(K)
129
+ intrinsics.append(convention_K)
130
+
131
+ Rot = np.eye(3)
132
+ Rot[0, 0] = 1
133
+ Rot[1, 1] = -1
134
+ Rot[2, 2] = -1
135
+ pose[:3, :3] = np.dot(pose[:3, :3], Rot)
136
+ convention_pose = _fix_pose_orig(pose)
137
+ convention_poses.append(convention_pose)
138
+ poses.append(pose)
139
+
140
+ intrinsics = np.stack(intrinsics) # [B, 3, 3]
141
+ poses = np.stack(poses) # [B, 4, 4]
142
+ convention_poses = np.stack(convention_poses) # [B, 4, 4]
143
+ item['intrinsics'] = intrinsics
144
+ item['c2w'] = poses
145
+ item['convention_c2w'] = convention_poses
146
+ return item
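A minimal usage sketch for the converter above, assuming the repo's `deep_3drecon` assets and `utils` package are importable from the project root; the `euler`/`trans` values below are dummy placeholders exercising the single-pose (1-D `euler`) branch.

```python
# Minimal sketch: convert one fitted head pose to the EG3D camera convention.
# The 'euler' (radians) and 'trans' values are dummies for illustration only.
import numpy as np
from data_gen.eg3d.convert_to_eg3d_convention import get_eg3d_convention_camera_pose_intrinsic

item = {
    "euler": np.zeros(3, dtype=np.float32),                # Euler angles from the 3DMM fit
    "trans": np.array([0.0, 0.0, 0.5], dtype=np.float32),  # head translation from the 3DMM fit
}
item = get_eg3d_convention_camera_pose_intrinsic(item)
print(item["c2w"].shape, item["convention_c2w"].shape, item["intrinsics"].shape)
# -> (4, 4) (4, 4) (3, 3)
```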
data_gen/runs/binarizer_nerf.py ADDED
@@ -0,0 +1,335 @@
1
+ import os
2
+ import numpy as np
3
+ import math
4
+ import json
5
+ import imageio
6
+ import torch
7
+ import tqdm
8
+ import cv2
9
+
10
+ from data_util.face3d_helper import Face3DHelper
11
+ from utils.commons.euler2rot import euler_trans_2_c2w, c2w_to_euler_trans
12
+ from data_gen.utils.process_video.euler2quaterion import euler2quaterion, quaterion2euler
13
+ from deep_3drecon.deep_3drecon_models.bfm import ParametricFaceModel
14
+
15
+
16
+ def euler2rot(euler_angle):
17
+ batch_size = euler_angle.shape[0]
18
+ theta = euler_angle[:, 0].reshape(-1, 1, 1)
19
+ phi = euler_angle[:, 1].reshape(-1, 1, 1)
20
+ psi = euler_angle[:, 2].reshape(-1, 1, 1)
21
+ one = torch.ones(batch_size, 1, 1).to(euler_angle.device)
22
+ zero = torch.zeros(batch_size, 1, 1).to(euler_angle.device)
23
+ rot_x = torch.cat((
24
+ torch.cat((one, zero, zero), 1),
25
+ torch.cat((zero, theta.cos(), theta.sin()), 1),
26
+ torch.cat((zero, -theta.sin(), theta.cos()), 1),
27
+ ), 2)
28
+ rot_y = torch.cat((
29
+ torch.cat((phi.cos(), zero, -phi.sin()), 1),
30
+ torch.cat((zero, one, zero), 1),
31
+ torch.cat((phi.sin(), zero, phi.cos()), 1),
32
+ ), 2)
33
+ rot_z = torch.cat((
34
+ torch.cat((psi.cos(), -psi.sin(), zero), 1),
35
+ torch.cat((psi.sin(), psi.cos(), zero), 1),
36
+ torch.cat((zero, zero, one), 1)
37
+ ), 2)
38
+ return torch.bmm(rot_x, torch.bmm(rot_y, rot_z))
39
+
40
+
41
+ def rot2euler(rot_mat):
42
+ batch_size = len(rot_mat)
43
+ # we assert that y in in [-0.5pi, 0.5pi]
44
+ cos_y = torch.sqrt(rot_mat[:, 1, 2] * rot_mat[:, 1, 2] + rot_mat[:, 2, 2] * rot_mat[:, 2, 2])
45
+ theta_x = torch.atan2(-rot_mat[:, 1, 2], rot_mat[:, 2, 2])
46
+ theta_y = torch.atan2(rot_mat[:, 2, 0], cos_y)
47
+ theta_z = torch.atan2(rot_mat[:, 0, 1], rot_mat[:, 0, 0])
48
+ euler_angles = torch.zeros([batch_size, 3])
49
+ euler_angles[:, 0] = theta_x
50
+ euler_angles[:, 1] = theta_y
51
+ euler_angles[:, 2] = theta_z
52
+ return euler_angles
53
+
54
+ index_lm68_from_lm468 = [127,234,93,132,58,136,150,176,152,400,379,365,288,361,323,454,356,70,63,105,66,107,336,296,334,293,300,168,197,5,4,75,97,2,326,305,
55
+ 33,160,158,133,153,144,362,385,387,263,373,380,61,40,37,0,267,270,291,321,314,17,84,91,78,81,13,311,308,402,14,178]
56
+
57
+ def plot_lm2d(lm2d):
58
+ WH = 512
59
+ img = np.ones([WH, WH, 3], dtype=np.uint8) * 255
60
+
61
+ for i in range(len(lm2d)):
62
+ x, y = lm2d[i]
63
+ color = (255,0,0)
64
+ img = cv2.circle(img, center=(int(x),int(y)), radius=3, color=color, thickness=-1)
65
+ font = cv2.FONT_HERSHEY_SIMPLEX
66
+ for i in range(len(lm2d)):
67
+ x, y = lm2d[i]
68
+ img = cv2.putText(img, f"{i}", org=(int(x),int(y)), fontFace=font, fontScale=0.3, color=(255,0,0))
69
+ return img
70
+
71
+ def get_face_rect(lms, h, w):
72
+ """
73
+ lms: [68, 2]
74
+ h, w: int
75
+ return: [4,]
76
+ """
77
+ assert len(lms) == 68
78
+ # min_x, max_x = np.min(lms, 0)[0], np.max(lms, 0)[0]
79
+ min_x, max_x = np.min(lms[:, 0]), np.max(lms[:, 0])
80
+ cx = int((min_x+max_x)/2.0)
81
+ cy = int(lms[27, 1])
82
+ h_w = int((max_x-cx)*1.5)
83
+ h_h = int((lms[8, 1]-cy)*1.15)
84
+ rect_x = cx - h_w
85
+ rect_y = cy - h_h
86
+ if rect_x < 0:
87
+ rect_x = 0
88
+ if rect_y < 0:
89
+ rect_y = 0
90
+ rect_w = min(w-1-rect_x, 2*h_w)
91
+ rect_h = min(h-1-rect_y, 2*h_h)
92
+ # rect = np.array((rect_x, rect_y, rect_w, rect_h), dtype=np.int32)
93
+ # rect = [rect_x, rect_y, rect_w, rect_h]
94
+ rect = [rect_x, rect_x + rect_w, rect_y, rect_y + rect_h] # min_j, max_j, min_i, max_i
95
+ return rect # this x is width, y is height
96
+
97
+ def get_lip_rect(lms, h, w):
98
+ """
99
+ lms: [68, 2]
100
+ h, w: int
101
+ return: [4,]
102
+ """
103
+ # this x is width, y is height
104
+ # for lms, lms[:, 0] is width, lms[:, 1] is height
105
+ assert len(lms) == 68
106
+ lips = slice(48, 60)
107
+ lms = lms[lips]
108
+ min_x, max_x = np.min(lms[:, 0]), np.max(lms[:, 0])
109
+ min_y, max_y = np.min(lms[:, 1]), np.max(lms[:, 1])
110
+ cx = int((min_x+max_x)/2.0)
111
+ cy = int((min_y+max_y)/2.0)
112
+ h_w = int((max_x-cx)*1.2)
113
+ h_h = int((max_y-cy)*1.2)
114
+
115
+ h_w = max(h_w, h_h)
116
+ h_h = h_w
117
+
118
+ rect_x = cx - h_w
119
+ rect_y = cy - h_h
120
+ rect_w = 2*h_w
121
+ rect_h = 2*h_h
122
+ if rect_x < 0:
123
+ rect_x = 0
124
+ if rect_y < 0:
125
+ rect_y = 0
126
+
127
+ if rect_x + rect_w > w:
128
+ rect_x = w - rect_w
129
+ if rect_y + rect_h > h:
130
+ rect_y = h - rect_h
131
+
132
+ rect = [rect_x, rect_x + rect_w, rect_y, rect_y + rect_h] # min_j, max_j, min_i, max_i
133
+ return rect # this x is width, y is height
134
+
135
+
136
+ # def get_lip_rect(lms, h, w):
137
+ # """
138
+ # lms: [68, 2]
139
+ # h, w: int
140
+ # return: [4,]
141
+ # """
142
+ # assert len(lms) == 68
143
+ # lips = slice(48, 60)
144
+ # # this x is width, y is height
145
+ # xmin, xmax = int(lms[lips, 1].min()), int(lms[lips, 1].max())
146
+ # ymin, ymax = int(lms[lips, 0].min()), int(lms[lips, 0].max())
147
+ # # padding to H == W
148
+ # cx = (xmin + xmax) // 2
149
+ # cy = (ymin + ymax) // 2
150
+ # l = max(xmax - xmin, ymax - ymin) // 2
151
+ # xmin = max(0, cx - l)
152
+ # xmax = min(h, cx + l)
153
+ # ymin = max(0, cy - l)
154
+ # ymax = min(w, cy + l)
155
+ # lip_rect = [xmin, xmax, ymin, ymax]
156
+ # return lip_rect
157
+
158
+ def get_win_conds(conds, idx, smo_win_size=8, pad_option='zero'):
159
+ """
160
+ conds: [b, t=16, h=29]
161
+ idx: long, time index of the selected frame
162
+ """
163
+ idx = max(0, idx)
164
+ idx = min(idx, conds.shape[0]-1)
165
+ smo_half_win_size = smo_win_size//2
166
+ left_i = idx - smo_half_win_size
167
+ right_i = idx + (smo_win_size - smo_half_win_size)
168
+ pad_left, pad_right = 0, 0
169
+ if left_i < 0:
170
+ pad_left = -left_i
171
+ left_i = 0
172
+ if right_i > conds.shape[0]:
173
+ pad_right = right_i - conds.shape[0]
174
+ right_i = conds.shape[0]
175
+ conds_win = conds[left_i:right_i]
176
+ if pad_left > 0:
177
+ if pad_option == 'zero':
178
+ conds_win = np.concatenate([np.zeros_like(conds_win)[:pad_left], conds_win], axis=0)
179
+ elif pad_option == 'edge':
180
+ edge_value = conds[0][np.newaxis, ...]
181
+ conds_win = np.concatenate([edge_value] * pad_left + [conds_win], axis=0)
182
+ else:
183
+ raise NotImplementedError
184
+ if pad_right > 0:
185
+ if pad_option == 'zero':
186
+ conds_win = np.concatenate([conds_win, np.zeros_like(conds_win)[:pad_right]], axis=0)
187
+ elif pad_option == 'edge':
188
+ edge_value = conds[-1][np.newaxis, ...]
189
+ conds_win = np.concatenate([conds_win] + [edge_value] * pad_right , axis=0)
190
+ else:
191
+ raise NotImplementedError
192
+ assert conds_win.shape[0] == smo_win_size
193
+ return conds_win
194
+
195
+
196
+ def load_processed_data(processed_dir):
197
+ # load necessary files
198
+ background_img_name = os.path.join(processed_dir, "bg.jpg")
199
+ assert os.path.exists(background_img_name)
200
+ head_img_dir = os.path.join(processed_dir, "head_imgs")
201
+ torso_img_dir = os.path.join(processed_dir, "inpaint_torso_imgs")
202
+ gt_img_dir = os.path.join(processed_dir, "gt_imgs")
203
+
204
+ hubert_npy_name = os.path.join(processed_dir, "aud_hubert.npy")
205
+ mel_f0_npy_name = os.path.join(processed_dir, "aud_mel_f0.npy")
206
+ coeff_npy_name = os.path.join(processed_dir, "coeff_fit_mp.npy")
207
+ lm2d_npy_name = os.path.join(processed_dir, "lms_2d.npy")
208
+
209
+ ret_dict = {}
210
+
211
+ ret_dict['bg_img'] = imageio.imread(background_img_name)
212
+ ret_dict['H'], ret_dict['W'] = ret_dict['bg_img'].shape[:2]
213
+ ret_dict['focal'], ret_dict['cx'], ret_dict['cy'] = face_model.focal, face_model.center, face_model.center
214
+
215
+ print("loading lm2d coeff ...")
216
+ lm2d_arr = np.load(lm2d_npy_name)
217
+ face_rect_lst = []
218
+ lip_rect_lst = []
219
+ for lm2d in lm2d_arr:
220
+ if len(lm2d) in [468, 478]:
221
+ lm2d = lm2d[index_lm68_from_lm468]
222
+ face_rect = get_face_rect(lm2d, ret_dict['H'], ret_dict['W'])
223
+ lip_rect = get_lip_rect(lm2d, ret_dict['H'], ret_dict['W'])
224
+ face_rect_lst.append(face_rect)
225
+ lip_rect_lst.append(lip_rect)
226
+ face_rects = np.stack(face_rect_lst, axis=0) # [T, 4]
227
+
228
+ print("loading fitted 3dmm coeff ...")
229
+ coeff_dict = np.load(coeff_npy_name, allow_pickle=True).tolist()
230
+ identity_arr = coeff_dict['id']
231
+ exp_arr = coeff_dict['exp']
232
+ ret_dict['id'] = identity_arr
233
+ ret_dict['exp'] = exp_arr
234
+ euler_arr = ret_dict['euler'] = coeff_dict['euler']
235
+ trans_arr = ret_dict['trans'] = coeff_dict['trans']
236
+ print("calculating lm3d ...")
237
+ idexp_lm3d_arr = face3d_helper.reconstruct_idexp_lm3d(torch.from_numpy(identity_arr), torch.from_numpy(exp_arr)).cpu().numpy().reshape([-1, 68*3])
238
+ len_motion = len(idexp_lm3d_arr)
239
+ video_idexp_lm3d_mean = idexp_lm3d_arr.mean(axis=0)
240
+ video_idexp_lm3d_std = idexp_lm3d_arr.std(axis=0)
241
+ ret_dict['idexp_lm3d'] = idexp_lm3d_arr
242
+ ret_dict['idexp_lm3d_mean'] = video_idexp_lm3d_mean
243
+ ret_dict['idexp_lm3d_std'] = video_idexp_lm3d_std
244
+
245
+ # now we convert the euler_trans from deep3d convention to adnerf convention
246
+ eulers = torch.FloatTensor(euler_arr)
247
+ trans = torch.FloatTensor(trans_arr)
248
+ rots = face_model.compute_rotation(eulers) # rotation matrix is a better intermediate for convention-transplan than euler
249
+
250
+ # handle the camera pose to geneface's convention
251
+ trans[:, 2] = 10 - trans[:, 2] # 抵消fit阶段的to_camera操作,即trans[...,2] = 10 - trans[...,2]
252
+ rots = rots.permute(0, 2, 1)
253
+ trans[:, 2] = - trans[:,2] # 因为intrinsic proj不同
254
+ # below is the NeRF camera preprocessing strategy, see `save_transforms` in data_util/process.py
255
+ trans = trans / 10.0
256
+ rots_inv = rots.permute(0, 2, 1)
257
+ trans_inv = - torch.bmm(rots_inv, trans.unsqueeze(2))
258
+
259
+ pose = torch.eye(4, dtype=torch.float32).unsqueeze(0).repeat([len_motion, 1, 1]) # [T, 4, 4]
260
+ pose[:, :3, :3] = rots_inv
261
+ pose[:, :3, 3] = trans_inv[:, :, 0]
262
+ c2w_transform_matrices = pose.numpy()
263
+
264
+ # process the audio features used for postnet training
265
+ print("loading hubert ...")
266
+ hubert_features = np.load(hubert_npy_name)
267
+ print("loading Mel and F0 ...")
268
+ mel_f0_features = np.load(mel_f0_npy_name, allow_pickle=True).tolist()
269
+
270
+ ret_dict['hubert'] = hubert_features
271
+ ret_dict['mel'] = mel_f0_features['mel']
272
+ ret_dict['f0'] = mel_f0_features['f0']
273
+
274
+ # obtaining train samples
275
+ frame_indices = list(range(len_motion))
276
+ num_train = len_motion // 11 * 10
277
+ train_indices = frame_indices[:num_train]
278
+ val_indices = frame_indices[num_train:]
279
+
280
+ for split in ['train', 'val']:
281
+ if split == 'train':
282
+ indices = train_indices
283
+ samples = []
284
+ ret_dict['train_samples'] = samples
285
+ elif split == 'val':
286
+ indices = val_indices
287
+ samples = []
288
+ ret_dict['val_samples'] = samples
289
+
290
+ for idx in indices:
291
+ sample = {}
292
+ sample['idx'] = idx
293
+ sample['head_img_fname'] = os.path.join(head_img_dir,f"{idx:08d}.png")
294
+ sample['torso_img_fname'] = os.path.join(torso_img_dir,f"{idx:08d}.png")
295
+ sample['gt_img_fname'] = os.path.join(gt_img_dir,f"{idx:08d}.jpg")
296
+ # assert os.path.exists(sample['head_img_fname']) and os.path.exists(sample['torso_img_fname']) and os.path.exists(sample['gt_img_fname'])
297
+ sample['face_rect'] = face_rects[idx]
298
+ sample['lip_rect'] = lip_rect_lst[idx]
299
+ sample['c2w'] = c2w_transform_matrices[idx]
300
+ samples.append(sample)
301
+ return ret_dict
302
+
303
+
304
+ class Binarizer:
305
+ def __init__(self):
306
+ self.data_dir = 'data/'
307
+
308
+ def parse(self, video_id):
309
+ processed_dir = os.path.join(self.data_dir, 'processed/videos', video_id)
310
+ binary_dir = os.path.join(self.data_dir, 'binary/videos', video_id)
311
+ out_fname = os.path.join(binary_dir, "trainval_dataset.npy")
312
+ os.makedirs(binary_dir, exist_ok=True)
313
+ ret = load_processed_data(processed_dir)
314
+ mel_name = os.path.join(processed_dir, 'aud_mel_f0.npy')
315
+ mel_f0_dict = np.load(mel_name, allow_pickle=True).tolist()
316
+ ret.update(mel_f0_dict)
317
+ np.save(out_fname, ret, allow_pickle=True)
318
+
319
+
320
+
321
+ if __name__ == '__main__':
322
+ from argparse import ArgumentParser
323
+ parser = ArgumentParser()
324
+ parser.add_argument('--video_id', type=str, default='May', help='')
325
+ args = parser.parse_args()
326
+ ### Process Single Long Audio for NeRF dataset
327
+ video_id = args.video_id
328
+ face_model = ParametricFaceModel(bfm_folder='deep_3drecon/BFM',
329
+ camera_distance=10, focal=1015)
330
+ face_model.to("cpu")
331
+ face3d_helper = Face3DHelper()
332
+
333
+ binarizer = Binarizer()
334
+ binarizer.parse(video_id)
335
+ print(f"Binarization for {video_id} Done!")
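A small sketch of the window-extraction helper above (`get_win_conds`), showing how padding behaves at the sequence boundaries; the array sizes are dummies and the repo's packages are assumed importable from the project root.

```python
# Minimal sketch: windowed condition extraction near sequence boundaries.
import numpy as np
from data_gen.runs.binarizer_nerf import get_win_conds

conds = np.arange(20, dtype=np.float32).reshape(10, 2)   # dummy [T=10, h=2] features

win = get_win_conds(conds, idx=0, smo_win_size=8, pad_option='zero')
print(win.shape)   # (8, 2): 4 zero-padded frames, then frames 0..3

win = get_win_conds(conds, idx=9, smo_win_size=8, pad_option='edge')
print(win.shape)   # (8, 2): frames 5..9, then 3 copies of the last frame
```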
data_gen/runs/nerf/process_guide.md ADDED
@@ -0,0 +1,49 @@
1
+ # 温馨提示:第一次执行可以先一步步跑完下面的命令行,把环境跑通后,之后可以直接运行同目录的run.sh,一键完成下面的所有步骤。
2
+
3
+ # Step0. 将视频Crop到512x512分辨率,25FPS,确保每一帧都有目标人脸
4
+ ```
5
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 data/raw/videos/${VIDEO_ID}_512.mp4
6
+ mv data/raw/videos/${VIDEO_ID}.mp4 data/raw/videos/${VIDEO_ID}_to_rm.mp4
7
+ mv data/raw/videos/${VIDEO_ID}_512.mp4 data/raw/videos/${VIDEO_ID}.mp4
8
+ ```
9
+ # step1: 提取音频特征, 如mel, f0, hubuert, esperanto
10
+ ```
11
+ export CUDA_VISIBLE_DEVICES=0
12
+ export VIDEO_ID=May
13
+ mkdir -p data/processed/videos/${VIDEO_ID}
14
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 data/processed/videos/${VIDEO_ID}/aud.wav
15
+ python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID}
16
+ python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID}
17
+ ```
18
+
19
+ # Step2. 提取图片
20
+ ```
21
+ export VIDEO_ID=May
22
+ export CUDA_VISIBLE_DEVICES=0
23
+ mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs
24
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg
25
+ python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background
26
+ ```
27
+
28
+ # Step3. 提取lm2d_mediapipe
29
+ ### 提取2D landmark用于之后Fit 3DMM
30
+ ### num_workers是本机上的CPU worker数量;total_process是使用的机器数;process_id是本机的编号
31
+
32
+ ```
33
+ export VIDEO_ID=May
34
+ python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4
35
+ ```
36
+
37
+ # Step3. fit 3dmm
38
+ ```
39
+ export VIDEO_ID=May
40
+ export CUDA_VISIBLE_DEVICES=0
41
+ python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global
42
+ ```
43
+
44
+ # Step4. Binarize
45
+ ```
46
+ export VIDEO_ID=May
47
+ python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID}
48
+ ```
49
+ 可以看到在`data/binary/videos/May`目录下得到了数据集。
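A quick sanity check of the binarized dataset produced by Step5 can be sketched as follows (assuming `VIDEO_ID=May`; the file is the dict saved by `data_gen/runs/binarizer_nerf.py`):

```
import numpy as np

ds = np.load("data/binary/videos/May/trainval_dataset.npy", allow_pickle=True).tolist()
print(sorted(ds.keys()))
print(len(ds["train_samples"]), len(ds["val_samples"]))  # roughly a 10:1 train/val split
```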
data_gen/runs/nerf/run.sh ADDED
@@ -0,0 +1,51 @@
1
+ # usage: CUDA_VISIBLE_DEVICES=0 bash data_gen/runs/nerf/run.sh <VIDEO_ID>
2
+ # please place video to data/raw/videos/${VIDEO_ID}.mp4
3
+ VIDEO_ID=$1
4
+ echo Processing $VIDEO_ID
5
+
6
+ echo Resizing the video to 512x512
7
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -y data/raw/videos/${VIDEO_ID}_512.mp4
8
+ mv data/raw/videos/${VIDEO_ID}.mp4 data/raw/videos/${VIDEO_ID}_to_rm.mp4
9
+ mv data/raw/videos/${VIDEO_ID}_512.mp4 data/raw/videos/${VIDEO_ID}.mp4
10
+ echo Done
11
+ echo The old video is moved from data/raw/videos/${VIDEO_ID}.mp4 to data/raw/videos/${VIDEO_ID}_to_rm.mp4
12
+
13
+ echo mkdir -p data/processed/videos/${VIDEO_ID}
14
+ mkdir -p data/processed/videos/${VIDEO_ID}
15
+ echo Done
16
+
17
+ # extract audio file from the training video
18
+ echo ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 -v quiet -y data/processed/videos/${VIDEO_ID}/aud.wav
19
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -f wav -ar 16000 -v quiet -y data/processed/videos/${VIDEO_ID}/aud.wav
20
+ echo Done
21
+
22
+ # extract hubert_mel_f0 from audio
23
+ echo python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID}
24
+ python data_gen/utils/process_audio/extract_hubert.py --video_id=${VIDEO_ID}
25
+ echo python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID}
26
+ python data_gen/utils/process_audio/extract_mel_f0.py --video_id=${VIDEO_ID}
27
+ echo Done
28
+
29
+ # extract segment images
30
+ echo mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs
31
+ mkdir -p data/processed/videos/${VIDEO_ID}/gt_imgs
32
+ echo ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg
33
+ ffmpeg -i data/raw/videos/${VIDEO_ID}.mp4 -vf fps=25,scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet data/processed/videos/${VIDEO_ID}/gt_imgs/%08d.jpg
34
+ echo Done
35
+
36
+ echo python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background
37
+ python data_gen/utils/process_video/extract_segment_imgs.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 # extract image, segmap, and background
38
+ echo Done
39
+
40
+ echo python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4
41
+ python data_gen/utils/process_video/extract_lm2d.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4
42
+ echo Done
43
+
44
+ pkill -f void*
45
+ echo python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global
46
+ python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/${VIDEO_ID}.mp4 --reset --debug --id_mode=global
47
+ echo Done
48
+
49
+ echo python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID}
50
+ python data_gen/runs/binarizer_nerf.py --video_id=${VIDEO_ID}
51
+ echo Done
data_gen/utils/mp_feature_extractors/face_landmarker.py ADDED
@@ -0,0 +1,130 @@
1
+ import mediapipe as mp
2
+ from mediapipe.tasks import python
3
+ from mediapipe.tasks.python import vision
4
+ import numpy as np
5
+ import cv2
6
+ import os
7
+ import copy
8
+
9
+ # simplified mediapipe ldm at https://github.com/k-m-irfan/simplified_mediapipe_face_landmarks
10
+ index_lm141_from_lm478 = [70,63,105,66,107,55,65,52,53,46] + [300,293,334,296,336,285,295,282,283,276] + [33,246,161,160,159,158,157,173,133,155,154,153,145,144,163,7] + [263,466,388,387,386,385,384,398,362,382,381,380,374,373,390,249] + [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95] + [61,185,40,39,37,0,267,269,270,409,291,375,321,405,314,17,84,181,91,146] + [10,338,297,332,284,251,389,356,454,323,361,288,397,365,379,378,400,377,152,148,176,149,150,136,172,58,132,93,234,127,162,21,54,103,67,109] + [468,469,470,471,472] + [473,474,475,476,477] + [64,4,294]
11
+ # lm141 without iris
12
+ index_lm131_from_lm478 = [70,63,105,66,107,55,65,52,53,46] + [300,293,334,296,336,285,295,282,283,276] + [33,246,161,160,159,158,157,173,133,155,154,153,145,144,163,7] + [263,466,388,387,386,385,384,398,362,382,381,380,374,373,390,249] + [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95] + [61,185,40,39,37,0,267,269,270,409,291,375,321,405,314,17,84,181,91,146] + [10,338,297,332,284,251,389,356,454,323,361,288,397,365,379,378,400,377,152,148,176,149,150,136,172,58,132,93,234,127,162,21,54,103,67,109] + [64,4,294]
13
+
14
+ # face alignment lm68
15
+ index_lm68_from_lm478 = [127,234,93,132,58,136,150,176,152,400,379,365,288,361,323,454,356,70,63,105,66,107,336,296,334,293,300,168,197,5,4,75,97,2,326,305,
16
+ 33,160,158,133,153,144,362,385,387,263,373,380,61,40,37,0,267,270,291,321,314,17,84,91,78,81,13,311,308,402,14,178]
17
+ # used for weights for key parts
18
+ unmatch_mask_from_lm478 = [ 93, 127, 132, 234, 323, 356, 361, 454]
19
+ index_eye_from_lm478 = [33,246,161,160,159,158,157,173,133,155,154,153,145,144,163,7] + [263,466,388,387,386,385,384,398,362,382,381,380,374,373,390,249]
20
+ index_innerlip_from_lm478 = [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95]
21
+ index_outerlip_from_lm478 = [61,185,40,39,37,0,267,269,270,409,291,375,321,405,314,17,84,181,91,146]
22
+ index_withinmouth_from_lm478 = [76, 62] + [184, 183, 74, 72, 73, 41, 72, 38, 11, 12, 302, 268, 303, 271, 304, 272, 408, 407] + [292, 306] + [325, 307, 319, 320, 403, 404, 316, 315, 15, 16, 86, 85, 179, 180, 89, 90, 96, 77]
23
+ index_mouth_from_lm478 = index_innerlip_from_lm478 + index_outerlip_from_lm478 + index_withinmouth_from_lm478
24
+
25
+ index_yaw_from_lm68 = list(range(0, 17))
26
+ index_brow_from_lm68 = list(range(17, 27))
27
+ index_nose_from_lm68 = list(range(27, 36))
28
+ index_eye_from_lm68 = list(range(36, 48))
29
+ index_mouth_from_lm68 = list(range(48, 68))
30
+
31
+
32
+ def read_video_to_frames(video_name):
33
+ frames = []
34
+ cap = cv2.VideoCapture(video_name)
35
+ while cap.isOpened():
36
+ ret, frame_bgr = cap.read()
37
+ if frame_bgr is None:
38
+ break
39
+ frames.append(frame_bgr)
40
+ frames = np.stack(frames)
41
+ frames = np.flip(frames, -1) # BGR ==> RGB
42
+ return frames
43
+
44
+ class MediapipeLandmarker:
45
+ def __init__(self):
46
+ model_path = 'data_gen/utils/mp_feature_extractors/face_landmarker.task'
47
+ if not os.path.exists(model_path):
48
+ os.makedirs(os.path.dirname(model_path), exist_ok=True)
49
+ print("downloading face_landmarker model from mediapipe...")
50
+ model_url = 'https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/latest/face_landmarker.task'
51
+ os.system(f"wget {model_url}")
52
+ os.system(f"mv face_landmarker.task {model_path}")
53
+ print("download success")
54
+ base_options = python.BaseOptions(model_asset_path=model_path)
55
+ self.image_mode_options = vision.FaceLandmarkerOptions(base_options=base_options,
56
+ running_mode=vision.RunningMode.IMAGE, # IMAGE, VIDEO, LIVE_STREAM
57
+ num_faces=1)
58
+ self.video_mode_options = vision.FaceLandmarkerOptions(base_options=base_options,
59
+ running_mode=vision.RunningMode.VIDEO, # IMAGE, VIDEO, LIVE_STREAM
60
+ num_faces=1)
61
+
62
+ def extract_lm478_from_img_name(self, img_name):
63
+ img = cv2.imread(img_name)
64
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
65
+ img_lm478 = self.extract_lm478_from_img(img)
66
+ return img_lm478
67
+
68
+ def extract_lm478_from_img(self, img):
69
+ img_landmarker = vision.FaceLandmarker.create_from_options(self.image_mode_options)
70
+ frame = mp.Image(image_format=mp.ImageFormat.SRGB, data=img.astype(np.uint8))
71
+ img_face_landmarker_result = img_landmarker.detect(image=frame)
72
+ img_ldm_i = img_face_landmarker_result.face_landmarks[0]
73
+ img_face_landmarks = np.array([[l.x, l.y, l.z] for l in img_ldm_i])
74
+ H, W, _ = img.shape
75
+ img_lm478 = np.array(img_face_landmarks)[:, :2] * np.array([W, H]).reshape([1,2]) # [478, 2]
76
+ return img_lm478
77
+
78
+ def extract_lm478_from_video_name(self, video_name, fps=25, anti_smooth_factor=2):
79
+ frames = read_video_to_frames(video_name)
80
+ img_lm478, vid_lm478 = self.extract_lm478_from_frames(frames, fps, anti_smooth_factor)
81
+ return img_lm478, vid_lm478
82
+
83
+ def extract_lm478_from_frames(self, frames, fps=25, anti_smooth_factor=20):
84
+ """
85
+ frames: RGB, uint8
86
+ anti_smooth_factor: float, scales the timestamp interval passed to video mode; 1 means unchanged, larger values behave closer to image mode
87
+ """
88
+ img_mpldms = []
89
+ vid_mpldms = []
90
+ img_landmarker = vision.FaceLandmarker.create_from_options(self.image_mode_options)
91
+ vid_landmarker = vision.FaceLandmarker.create_from_options(self.video_mode_options)
92
+
93
+ for i in range(len(frames)):
94
+ frame = mp.Image(image_format=mp.ImageFormat.SRGB, data=frames[i].astype(np.uint8))
95
+ img_face_landmarker_result = img_landmarker.detect(image=frame)
96
+ vid_face_landmarker_result = vid_landmarker.detect_for_video(image=frame, timestamp_ms=int((1000/fps)*anti_smooth_factor*i))
97
+ try:
98
+ img_ldm_i = img_face_landmarker_result.face_landmarks[0]
99
+ vid_ldm_i = vid_face_landmarker_result.face_landmarks[0]
100
+ except:
101
+ print(f"Warning: failed to detect landmarks at idx={i}, using previous frame results.")
102
+ img_face_landmarks = np.array([[l.x, l.y, l.z] for l in img_ldm_i])
103
+ vid_face_landmarks = np.array([[l.x, l.y, l.z] for l in vid_ldm_i])
104
+ img_mpldms.append(img_face_landmarks)
105
+ vid_mpldms.append(vid_face_landmarks)
106
+ img_lm478 = np.stack(img_mpldms)[..., :2]
107
+ vid_lm478 = np.stack(vid_mpldms)[..., :2]
108
+ bs, H, W, _ = frames.shape
109
+ img_lm478 = np.array(img_lm478)[..., :2] * np.array([W, H]).reshape([1,1,2]) # [T, 478, 2]
110
+ vid_lm478 = np.array(vid_lm478)[..., :2] * np.array([W, H]).reshape([1,1,2]) # [T, 478, 2]
111
+ return img_lm478, vid_lm478
112
+
113
+ def combine_vid_img_lm478_to_lm68(self, img_lm478, vid_lm478):
114
+ img_lm68 = img_lm478[:, index_lm68_from_lm478]
115
+ vid_lm68 = vid_lm478[:, index_lm68_from_lm478]
116
+ combined_lm68 = copy.deepcopy(img_lm68)
117
+ combined_lm68[:, index_yaw_from_lm68] = vid_lm68[:, index_yaw_from_lm68]
118
+ combined_lm68[:, index_brow_from_lm68] = vid_lm68[:, index_brow_from_lm68]
119
+ combined_lm68[:, index_nose_from_lm68] = vid_lm68[:, index_nose_from_lm68]
120
+ return combined_lm68
121
+
122
+ def combine_vid_img_lm478_to_lm478(self, img_lm478, vid_lm478):
123
+ combined_lm478 = copy.deepcopy(vid_lm478)
124
+ combined_lm478[:, index_mouth_from_lm478] = img_lm478[:, index_mouth_from_lm478]
125
+ combined_lm478[:, index_eye_from_lm478] = img_lm478[:, index_eye_from_lm478]
126
+ return combined_lm478
127
+
128
+ if __name__ == '__main__':
129
+ landmarker = MediapipeLandmarker()
130
+ ret = landmarker.extract_lm478_from_video_name("00000.mp4")
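A minimal sketch of combining the image-mode and video-mode landmark tracks extracted above; `"video.mp4"` is a placeholder path.

```python
# Minimal sketch: fuse image-mode and video-mode landmarks into the lm68 / lm478 layouts.
from data_gen.utils.mp_feature_extractors.face_landmarker import MediapipeLandmarker

landmarker = MediapipeLandmarker()
img_lm478, vid_lm478 = landmarker.extract_lm478_from_video_name("video.mp4")   # [T, 478, 2] each
lm68 = landmarker.combine_vid_img_lm478_to_lm68(img_lm478, vid_lm478)          # [T, 68, 2]
lm478 = landmarker.combine_vid_img_lm478_to_lm478(img_lm478, vid_lm478)        # [T, 478, 2]
print(lm68.shape, lm478.shape)
```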
data_gen/utils/mp_feature_extractors/face_landmarker.task ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
3
+ size 3758596
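When the repo is fetched without Git LFS, this file contains only the three-line pointer shown above; a minimal sketch for inspecting such a pointer (assuming the pointer, not the binary, is on disk):

```python
# Minimal sketch: parse a Git LFS pointer file (fields: version, oid, size).
def read_lfs_pointer(path="data_gen/utils/mp_feature_extractors/face_landmarker.task"):
    with open(path) as f:
        return dict(line.split(" ", 1) for line in f.read().splitlines() if line)

print(read_lfs_pointer())   # e.g. {'version': '...', 'oid': 'sha256:...', 'size': '3758596'}
```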
data_gen/utils/mp_feature_extractors/mp_segmenter.py ADDED
@@ -0,0 +1,274 @@
1
+ import os
2
+ import copy
3
+ import numpy as np
4
+ import tqdm
5
+ import mediapipe as mp
6
+ import torch
7
+ from mediapipe.tasks import python
8
+ from mediapipe.tasks.python import vision
9
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm, multiprocess_run
10
+ from utils.commons.tensor_utils import convert_to_np
11
+ from sklearn.neighbors import NearestNeighbors
12
+
13
+ def scatter_np(condition_img, classSeg=5):
14
+ # def scatter(condition_img, classSeg=19, label_size=(512, 512)):
15
+ batch, c, height, width = condition_img.shape
16
+ # if height != label_size[0] or width != label_size[1]:
17
+ # condition_img= F.interpolate(condition_img, size=label_size, mode='nearest')
18
+ input_label = np.zeros([batch, classSeg, condition_img.shape[2], condition_img.shape[3]]).astype(np.int_)
19
+ # input_label = torch.zeros(batch, classSeg, *label_size, device=condition_img.device)
20
+ np.put_along_axis(input_label, condition_img, 1, 1)
21
+ return input_label
22
+
23
+ def scatter(condition_img, classSeg=19):
24
+ # def scatter(condition_img, classSeg=19, label_size=(512, 512)):
25
+ batch, c, height, width = condition_img.size()
26
+ # if height != label_size[0] or width != label_size[1]:
27
+ # condition_img= F.interpolate(condition_img, size=label_size, mode='nearest')
28
+ input_label = torch.zeros(batch, classSeg, condition_img.shape[2], condition_img.shape[3], device=condition_img.device)
29
+ # input_label = torch.zeros(batch, classSeg, *label_size, device=condition_img.device)
30
+ return input_label.scatter_(1, condition_img.long(), 1)
31
+
32
+ def encode_segmap_mask_to_image(segmap):
33
+ # rgb
34
+ _,h,w = segmap.shape
35
+ encoded_img = np.ones([h,w,3],dtype=np.uint8) * 255
36
+ colors = [(255,255,255),(255,255,0),(255,0,255),(0,255,255),(255,0,0),(0,255,0)]
37
+ for i, color in enumerate(colors):
38
+ mask = segmap[i].astype(int)
39
+ index = np.where(mask != 0)
40
+ encoded_img[index[0], index[1], :] = np.array(color)
41
+ return encoded_img.astype(np.uint8)
42
+
43
+ def decode_segmap_mask_from_image(encoded_img):
44
+ # rgb
45
+ colors = [(255,255,255),(255,255,0),(255,0,255),(0,255,255),(255,0,0),(0,255,0)]
46
+ bg = (encoded_img[..., 0] == 255) & (encoded_img[..., 1] == 255) & (encoded_img[..., 2] == 255)
47
+ hair = (encoded_img[..., 0] == 255) & (encoded_img[..., 1] == 255) & (encoded_img[..., 2] == 0)
48
+ body_skin = (encoded_img[..., 0] == 255) & (encoded_img[..., 1] == 0) & (encoded_img[..., 2] == 255)
49
+ face_skin = (encoded_img[..., 0] == 0) & (encoded_img[..., 1] == 255) & (encoded_img[..., 2] == 255)
50
+ clothes = (encoded_img[..., 0] == 255) & (encoded_img[..., 1] == 0) & (encoded_img[..., 2] == 0)
51
+ others = (encoded_img[..., 0] == 0) & (encoded_img[..., 1] == 255) & (encoded_img[..., 2] == 0)
52
+ segmap = np.stack([bg, hair, body_skin, face_skin, clothes, others], axis=0)
53
+ return segmap.astype(np.uint8)
54
+
55
+ def read_video_frame(video_name, frame_id):
56
+ # https://blog.csdn.net/bby1987/article/details/108923361
57
+ # frame_num = video_capture.get(cv2.CAP_PROP_FRAME_COUNT) # ==> total number of frames
58
+ # fps = video_capture.get(cv2.CAP_PROP_FPS) # ==> frame rate
59
+ # width = video_capture.get(cv2.CAP_PROP_FRAME_WIDTH) # ==> video width
60
+ # height = video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT) # ==> video height
61
+ # pos = video_capture.get(cv2.CAP_PROP_POS_FRAMES) # ==> current frame position
62
+ # video_capture.set(cv2.CAP_PROP_POS_FRAMES, 1000) # ==> seek to a frame position
63
+ # pos = video_capture.get(cv2.CAP_PROP_POS_FRAMES) # ==> now pos = 1000.0
64
+ # video_capture.release()
65
+ vr = cv2.VideoCapture(video_name)
66
+ vr.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
67
+ _, frame = vr.read()
68
+ return frame
69
+
70
+ def decode_segmap_mask_from_segmap_video_frame(video_frame):
71
+ # video_frame: 0~255 BGR, obtained by read_video_frame
72
+ def assign_values(array):
73
+ remainder = array % 40 # remainder of each value modulo 40
74
+ assigned_values = np.where(remainder <= 20, array - remainder, array + (40 - remainder))
75
+ return assigned_values
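+ # (added note) e.g. the video encoder below stores class k as pixel value k*40; a decoded pixel of 82 has
+ # remainder 2 (<= 20), is rounded down to 80, and 80 // 40 recovers class 2 (body-skin).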
76
+ segmap = video_frame.mean(-1)
77
+ segmap = assign_values(segmap) // 40 # [H, W] with value 0~5
78
+ segmap_mask = scatter_np(segmap[None, None, ...], classSeg=6)[0] # [6, H, W]
79
+ return segmap_mask.astype(np.uint8) # [6, H, W]; previously returned the raw [H, W] class map by mistake
80
+
81
+ def extract_background(img_lst, segmap_lst=None):
82
+ """
83
+ img_lst: list of RGB ndarrays. segmap_lst: optional list of matching [6, H, W] one-hot segmaps;
84
+ if None, a module-level `seg_model` (a MediapipeSegmenter instance) is assumed to exist.
+ Returns an [H, W, 3] uint8 background image: each pixel is taken from the frame in which it lies
+ farthest from any foreground pixel, and the remaining pixels are filled from their nearest recovered
+ background pixel.
+ """
85
+ # only use 1/20 images
86
+ num_frames = len(img_lst)
87
+ img_lst = img_lst[::20] if num_frames > 20 else img_lst[0:1]
88
+
89
+ if segmap_lst is not None:
90
+ segmap_lst = segmap_lst[::20] if num_frames > 20 else segmap_lst[0:1]
91
+ assert len(img_lst) == len(segmap_lst)
92
+ # get H/W
93
+ h, w = img_lst[0].shape[:2]
94
+
95
+ # nearest neighbors
96
+ all_xys = np.mgrid[0:h, 0:w].reshape(2, -1).transpose()
97
+ distss = []
98
+ for idx, img in enumerate(img_lst):
99
+ if segmap_lst is not None:
100
+ segmap = segmap_lst[idx]
101
+ else:
102
+ segmap = seg_model._cal_seg_map(img)
103
+ bg = (segmap[0]).astype(bool)
104
+ fg_xys = np.stack(np.nonzero(~bg)).transpose(1, 0)
105
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(fg_xys)
106
+ dists, _ = nbrs.kneighbors(all_xys)
107
+ distss.append(dists)
108
+
109
+ distss = np.stack(distss)
110
+ max_dist = np.max(distss, 0)
111
+ max_id = np.argmax(distss, 0)
112
+
113
+ bc_pixs = max_dist > 10 # 5
114
+ bc_pixs_id = np.nonzero(bc_pixs)
115
+ bc_ids = max_id[bc_pixs]
116
+
117
+ num_pixs = distss.shape[1]
118
+ imgs = np.stack(img_lst).reshape(-1, num_pixs, 3)
119
+
120
+ bg_img = np.zeros((h*w, 3), dtype=np.uint8)
121
+ bg_img[bc_pixs_id, :] = imgs[bc_ids, bc_pixs_id, :]
122
+ bg_img = bg_img.reshape(h, w, 3)
123
+
124
+ max_dist = max_dist.reshape(h, w)
125
+ bc_pixs = max_dist > 10 # 5
126
+ bg_xys = np.stack(np.nonzero(~bc_pixs)).transpose()
127
+ fg_xys = np.stack(np.nonzero(bc_pixs)).transpose()
128
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(fg_xys)
129
+ distances, indices = nbrs.kneighbors(bg_xys)
130
+ bg_fg_xys = fg_xys[indices[:, 0]]
131
+ bg_img[bg_xys[:, 0], bg_xys[:, 1], :] = bg_img[bg_fg_xys[:, 0], bg_fg_xys[:, 1], :]
132
+ return bg_img
133
+
134
+
135
+ class MediapipeSegmenter:
136
+ def __init__(self):
137
+ model_path = 'data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite'
138
+ if not os.path.exists(model_path):
139
+ os.makedirs(os.path.dirname(model_path), exist_ok=True)
140
+ print("downloading segmenter model from mediapipe...")
141
+ os.system(f"wget https://storage.googleapis.com/mediapipe-models/image_segmenter/selfie_multiclass_256x256/float32/latest/selfie_multiclass_256x256.tflite")
142
+ os.system(f"mv selfie_multiclass_256x256.tflite {model_path}")
143
+ print("download success")
144
+ base_options = python.BaseOptions(model_asset_path=model_path)
145
+ self.options = vision.ImageSegmenterOptions(base_options=base_options,running_mode=vision.RunningMode.IMAGE, output_category_mask=True)
146
+ self.video_options = vision.ImageSegmenterOptions(base_options=base_options,running_mode=vision.RunningMode.VIDEO, output_category_mask=True)
147
+
148
+ def _cal_seg_map_for_video(self, imgs, segmenter=None, return_onehot_mask=True, return_segmap_image=True, debug_fill=False):
149
+ segmenter = vision.ImageSegmenter.create_from_options(self.video_options) if segmenter is None else segmenter
150
+ assert return_onehot_mask or return_segmap_image # you should at least return one
151
+ segmap_masks = []
152
+ segmap_images = []
153
+ for i in tqdm.trange(len(imgs), desc="extracting segmaps from a video..."):
154
+ # for i in range(len(imgs)):
155
+ img = imgs[i]
156
+ mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=img)
157
+ out = segmenter.segment_for_video(mp_image, 40 * i)
158
+ segmap = out.category_mask.numpy_view().copy() # [H, W]
159
+ if debug_fill:
160
+ # print(f'segmap {segmap}')
161
+ for x in range(-80 + 1, 0):
162
+ for y in range(200, 350):
163
+ segmap[x][y] = 4
164
+
165
+ if return_onehot_mask:
166
+ segmap_mask = scatter_np(segmap[None, None, ...], classSeg=6)[0] # [6, H, W]
167
+ segmap_masks.append(segmap_mask)
168
+ if return_segmap_image:
169
+ segmap_image = segmap[:, :, None].repeat(3, 2).astype(float)
170
+ segmap_image = (segmap_image * 40).astype(np.uint8)
171
+ segmap_images.append(segmap_image)
172
+
173
+ if return_onehot_mask and return_segmap_image:
174
+ return segmap_masks, segmap_images
175
+ elif return_onehot_mask:
176
+ return segmap_masks
177
+ elif return_segmap_image:
178
+ return segmap_images
179
+
180
+ def _cal_seg_map(self, img, segmenter=None, return_onehot_mask=True):
181
+ """
182
+ segmenter: vision.ImageSegmenter.create_from_options(options)
183
+ img: numpy, [H, W, 3], 0~255
184
+ segmap: [C, H, W]
185
+ 0 - background
186
+ 1 - hair
187
+ 2 - body-skin
188
+ 3 - face-skin
189
+ 4 - clothes
190
+ 5 - others (accessories)
191
+ """
192
+ assert img.ndim == 3
193
+ segmenter = vision.ImageSegmenter.create_from_options(self.options) if segmenter is None else segmenter
194
+ image = mp.Image(image_format=mp.ImageFormat.SRGB, data=img)
195
+ out = segmenter.segment(image)
196
+ segmap = out.category_mask.numpy_view().copy() # [H, W]
197
+ if return_onehot_mask:
198
+ segmap = scatter_np(segmap[None, None, ...], classSeg=6)[0] # [6, H, W]
199
+ return segmap
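+ # (added usage sketch, not in the original code) with the one-hot segmap above, a binary head mask can be
+ # built as `head_mask = segmap[[1, 3, 5]].sum(axis=0) > 0.5` (hair + face-skin + others), which is exactly
+ # what _seg_out_img_with_segmap below does for mode='head'.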
200
+
201
+ def _seg_out_img_with_segmap(self, img, segmap, mode='head'):
202
+ """
203
+ img: [h,w,c], img is in 0~255, np
204
+ """
205
+ #
206
+ img = copy.deepcopy(img)
207
+ if mode == 'head':
208
+ selected_mask = segmap[[1,3,5] , :, :].sum(axis=0)[None,:] > 0.5 # hair + face-skin + others (glasses are counted as 'others')
209
+ img[~selected_mask.repeat(3,axis=0).transpose(1,2,0)] = 0 # (-1,-1,-1) denotes black in our [-1,1] convention
210
+ # selected_mask = segmap[[1,3] , :, :].sum(dim=0, keepdim=True) > 0.5
211
+ elif mode == 'person':
212
+ selected_mask = segmap[[1,2,3,4,5], :, :].sum(axis=0)[None,:] > 0.5
213
+ img[~selected_mask.repeat(3,axis=0).transpose(1,2,0)] = 0 # (-1,-1,-1) denotes black in our [-1,1] convention
214
+ elif mode == 'torso':
215
+ selected_mask = segmap[[2,4], :, :].sum(axis=0)[None,:] > 0.5
216
+ img[~selected_mask.repeat(3,axis=0).transpose(1,2,0)] = 0 # (-1,-1,-1) denotes black in our [-1,1] convention
217
+ elif mode == 'torso_with_bg':
218
+ selected_mask = segmap[[0, 2,4], :, :].sum(axis=0)[None,:] > 0.5
219
+ img[~selected_mask.repeat(3,axis=0).transpose(1,2,0)] = 0 # (-1,-1,-1) denotes black in our [-1,1] convention
220
+ elif mode == 'bg':
221
+ selected_mask = segmap[[0], :, :].sum(axis=0)[None,:] > 0.5 # only seg out 0, which means background
222
+ img[~selected_mask.repeat(3,axis=0).transpose(1,2,0)] = 0 # (-1,-1,-1) denotes black in our [-1,1] convention
223
+ elif mode == 'full':
224
+ pass
225
+ else:
226
+ raise NotImplementedError()
227
+ return img, selected_mask
228
+
229
+ def _seg_out_img(self, img, segmenter=None, mode='head'):
230
+ """
231
+ imgs [H, W, 3] 0-255
232
+ return: (seg_img [H, W, 3], selected_mask [1, H, W])
233
+ """
234
+ segmenter = vision.ImageSegmenter.create_from_options(self.options) if segmenter is None else segmenter
235
+ segmap = self._cal_seg_map(img, segmenter=segmenter, return_onehot_mask=True) # [6, H, W]
236
+ return self._seg_out_img_with_segmap(img, segmap, mode=mode)
237
+
238
+ def seg_out_imgs(self, img, mode='head'):
239
+ """
240
+ api for pytorch img, -1~1
241
+ img: [B, 3, H, W], -1~1
242
+ """
243
+ device = img.device
244
+ img = convert_to_np(img.permute(0, 2, 3, 1)) # [B, H, W, 3]
245
+ img = ((img + 1) * 127.5).astype(np.uint8)
246
+ img_lst = [copy.deepcopy(img[i]) for i in range(len(img))]
247
+ out_lst = []
248
+ for im in img_lst:
249
+ out, _ = self._seg_out_img(im, mode=mode) # keep only the segmented image; the mask is discarded before stacking
250
+ out_lst.append(out)
251
+ seg_imgs = np.stack(out_lst) # [B, H, W, 3]
252
+ seg_imgs = (seg_imgs - 127.5) / 127.5
253
+ seg_imgs = torch.from_numpy(seg_imgs).permute(0, 3, 1, 2).to(device)
254
+ return seg_imgs
255
+
256
+ if __name__ == '__main__':
257
+ import imageio, cv2, tqdm
258
+ import torchshow as ts
259
+ img = imageio.imread("1.png")
260
+ img = cv2.resize(img, (512,512))
261
+
262
+ seg_model = MediapipeSegmenter()
263
+ img = torch.tensor(img).unsqueeze(0).repeat([1, 1, 1, 1]).permute(0, 3,1,2)
264
+ img = (img-127.5)/127.5
265
+ out = seg_model.seg_out_imgs(img, 'torso')
266
+ ts.save(out,"torso.png")
267
+ out = seg_model.seg_out_imgs(img, 'head')
268
+ ts.save(out,"head.png")
269
+ out = seg_model.seg_out_imgs(img, 'bg')
270
+ ts.save(out,"bg.png")
271
+ img = convert_to_np(img.permute(0, 2, 3, 1)) # [B, H, W, 3]
272
+ img = ((img + 1) * 127.5).astype(np.uint8)
273
+ bg = extract_background(img)
274
+ ts.save(bg,"bg2.png")
data_gen/utils/mp_feature_extractors/selfie_multiclass_256x256.tflite ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6748b1253a99067ef71f7e26ca71096cd449baefa8f101900ea23016507e0e0
3
+ size 16371837
data_gen/utils/path_converter.py ADDED
@@ -0,0 +1,24 @@
1
+ import os
2
+
3
+
4
+ class PathConverter():
5
+ def __init__(self):
6
+ self.prefixs = {
7
+ "vid": "/video/",
8
+ "gt": "/gt_imgs/",
9
+ "head": "/head_imgs/",
10
+ "torso": "/torso_imgs/",
11
+ "person": "/person_imgs/",
12
+ "torso_with_bg": "/torso_with_bg_imgs/",
13
+ "single_bg": "/bg_img/",
14
+ "bg": "/bg_imgs/",
15
+ "segmaps": "/segmaps/",
16
+ "inpaint_torso": "/inpaint_torso_imgs/",
17
+ "com": "/com_imgs/",
18
+ "inpaint_torso_with_com_bg": "/inpaint_torso_with_com_bg_imgs/",
19
+ }
20
+
21
+ def to(self, path: str, old_pattern: str, new_pattern: str):
22
+ return path.replace(self.prefixs[old_pattern], self.prefixs[new_pattern], 1)
23
+
24
+ pc = PathConverter()
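+ # (added usage sketch with a hypothetical path) pc.to("data/May/video/clip.mp4", "vid", "gt") replaces the
+ # first "/video/" with "/gt_imgs/" and returns "data/May/gt_imgs/clip.mp4".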
data_gen/utils/process_audio/extract_hubert.py ADDED
@@ -0,0 +1,95 @@
1
+ from transformers import Wav2Vec2Processor, HubertModel
2
+ import soundfile as sf
3
+ import numpy as np
4
+ import torch
5
+ import os
6
+ from utils.commons.hparams import set_hparams, hparams
7
+
8
+
9
+ wav2vec2_processor = None
10
+ hubert_model = None
11
+
12
+
13
+ def get_hubert_from_16k_wav(wav_16k_name):
14
+ speech_16k, _ = sf.read(wav_16k_name)
15
+ hubert = get_hubert_from_16k_speech(speech_16k)
16
+ return hubert
17
+
18
+ @torch.no_grad()
19
+ def get_hubert_from_16k_speech(speech, device="cuda:0"):
20
+ global hubert_model, wav2vec2_processor
21
+ local_path = '/home/tiger/.cache/huggingface/hub/models--facebook--hubert-large-ls960-ft/snapshots/ece5fabbf034c1073acae96d5401b25be96709d8'
22
+ if hubert_model is None:
23
+ print("Loading the HuBERT Model...")
24
+ if os.path.exists(local_path):
25
+ hubert_model = HubertModel.from_pretrained(local_path)
26
+ else:
27
+ hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
28
+ hubert_model = hubert_model.to(device)
29
+ if wav2vec2_processor is None:
30
+ print("Loading the Wav2Vec2 Processor...")
31
+ if os.path.exists(local_path):
32
+ wav2vec2_processor = Wav2Vec2Processor.from_pretrained(local_path)
33
+ else:
34
+ wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
35
+
36
+ if speech.ndim ==2:
37
+ speech = speech[:, 0] # [T, 2] ==> [T,]
38
+
39
+ input_values_all = wav2vec2_processor(speech, return_tensors="pt", sampling_rate=16000).input_values # [1, T]
40
+ input_values_all = input_values_all.to(device)
41
+ # For a long audio sequence we cannot process everything in one run due to the memory limit.
42
+ # HuBERT processes the wav with a CNN whose strides are [5,2,2,2,2,2], i.e. an overall stride of 320.
43
+ # Besides, the kernels are [10,3,3,3,3,2,2], so 400 samples are the fundamental unit for 1 time step.
44
+ # So the CNN is equal to one big Conv1D with kernel k=400 and stride s=320,
45
+ # and we use the equation T = floor((t-k)/s) to estimate the number of output time steps.
46
+ # To prevent overlap, we set each clip length of (K+S*(N-1)), where N is the expected length T of this clip
47
+ # The start point of next clip should roll back with a length of (kernel-stride) so it is stride * N
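+ # (added worked example) for a 10 s, 16 kHz waveform (t = 160000 samples),
+ # expected_T = (160000 - (400 - 320)) // 320 = 499, i.e. roughly 50 HuBERT frames per second.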
48
+ kernel = 400
49
+ stride = 320
50
+ clip_length = stride * 1000
51
+ num_iter = input_values_all.shape[1] // clip_length
52
+ expected_T = (input_values_all.shape[1] - (kernel-stride)) // stride
53
+ res_lst = []
54
+ for i in range(num_iter):
55
+ if i == 0:
56
+ start_idx = 0
57
+ end_idx = clip_length - stride + kernel
58
+ else:
59
+ start_idx = clip_length * i
60
+ end_idx = start_idx + (clip_length - stride + kernel)
61
+ input_values = input_values_all[:, start_idx: end_idx]
62
+ hidden_states = hubert_model.forward(input_values).last_hidden_state # [B=1, T=pts//320, hid=1024]
63
+ res_lst.append(hidden_states[0])
64
+ if num_iter > 0:
65
+ input_values = input_values_all[:, clip_length * num_iter:]
66
+ else:
67
+ input_values = input_values_all
68
+
69
+ if input_values.shape[1] >= kernel: # if the last batch is shorter than kernel_size, skip it
70
+ hidden_states = hubert_model(input_values).last_hidden_state # [B=1, T=pts//320, hid=1024]
71
+ res_lst.append(hidden_states[0])
72
+ ret = torch.cat(res_lst, dim=0).cpu() # [T, 1024]
73
+
74
+ assert abs(ret.shape[0] - expected_T) <= 1
75
+ if ret.shape[0] < expected_T: # the last short clip was skipped, so pad by repeating the final frame
76
+ ret = torch.cat([ret, ret[-1:, :].repeat([expected_T - ret.shape[0], 1])], dim=0) # ret is [T, 1024], so pad along dim 0
77
+ else:
78
+ ret = ret[:expected_T]
79
+
80
+ return ret
81
+
82
+
83
+ if __name__ == '__main__':
84
+ from argparse import ArgumentParser
85
+ parser = ArgumentParser()
86
+ parser.add_argument('--video_id', type=str, default='May', help='')
87
+ args = parser.parse_args()
88
+ ### Process Single Long Audio for NeRF dataset
89
+ person_id = args.video_id
90
+ wav_16k_name = f"data/processed/videos/{person_id}/aud.wav"
91
+ hubert_npy_name = f"data/processed/videos/{person_id}/aud_hubert.npy"
92
+ speech_16k, _ = sf.read(wav_16k_name)
93
+ hubert_hidden = get_hubert_from_16k_speech(speech_16k)
94
+ np.save(hubert_npy_name, hubert_hidden.detach().numpy())
95
+ print(f"Saved at {hubert_npy_name}")
data_gen/utils/process_audio/extract_mel_f0.py ADDED
@@ -0,0 +1,148 @@
1
+ import numpy as np
2
+ import torch
3
+ import glob
4
+ import os
5
+ import tqdm
6
+ import librosa
7
+ import parselmouth
8
+ from utils.commons.pitch_utils import f0_to_coarse
9
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
10
+ from utils.commons.os_utils import multiprocess_glob
11
+ from utils.audio.io import save_wav
12
+
13
+ from moviepy.editor import VideoFileClip
14
+ from utils.commons.hparams import hparams, set_hparams
15
+
16
+ def resample_wav(wav_name, out_name, sr=16000):
17
+ wav_raw, sr = librosa.core.load(wav_name, sr=sr)
18
+ save_wav(wav_raw, out_name, sr)
19
+
20
+ def split_wav(mp4_name, wav_name=None):
21
+ if wav_name is None:
22
+ wav_name = mp4_name.replace(".mp4", ".wav").replace("/video/", "/audio/")
23
+ if os.path.exists(wav_name):
24
+ return wav_name
25
+ os.makedirs(os.path.dirname(wav_name), exist_ok=True)
26
+
27
+ video = VideoFileClip(mp4_name,verbose=False)
28
+ dur = video.duration
29
+ audio = video.audio
30
+ assert audio is not None
31
+ audio.write_audiofile(wav_name,fps=16000,verbose=False,logger=None)
32
+ return wav_name
33
+
34
+ def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
35
+ '''compute right padding (final frame) or both sides padding (first and final frames)
36
+ '''
37
+ assert pad_sides in (1, 2)
38
+ # return int(fsize // 2)
39
+ pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
40
+ if pad_sides == 1:
41
+ return 0, pad
42
+ else:
43
+ return pad // 2, pad // 2 + pad % 2
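+ # (added example) for a 16000-sample wav with fshift=320: pad = (50 + 1) * 320 - 16000 = 320, so
+ # pad_sides=1 returns (0, 320) and the padded length becomes an exact multiple of the hop size.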
44
+
45
+ def extract_mel_from_fname(wav_path,
46
+ fft_size=512,
47
+ hop_size=320,
48
+ win_length=512,
49
+ window="hann",
50
+ num_mels=80,
51
+ fmin=80,
52
+ fmax=7600,
53
+ eps=1e-6,
54
+ sample_rate=16000,
55
+ min_level_db=-100):
56
+ if isinstance(wav_path, str):
57
+ wav, _ = librosa.core.load(wav_path, sr=sample_rate)
58
+ else:
59
+ wav = wav_path
60
+
61
+ # get amplitude spectrogram
62
+ x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
63
+ win_length=win_length, window=window, center=False)
64
+ spc = np.abs(x_stft) # (n_bins, T)
65
+
66
+ # get mel basis
67
+ fmin = 0 if fmin == -1 else fmin
68
+ fmax = sample_rate / 2 if fmax == -1 else fmax
69
+ mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=fmin, fmax=fmax)
70
+ mel = mel_basis @ spc
71
+
72
+ mel = np.log10(np.maximum(eps, mel)) # (n_mel_bins, T)
73
+ mel = mel.T
74
+
75
+ l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1)
76
+ wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
77
+
78
+ return wav.T, mel
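+ # (added note) with hop_size=320 at 16 kHz the mel hop is 20 ms, so `mel` has about 50 frames per second,
+ # i.e. 2 mel frames per frame of 25 fps video.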
79
+
80
+ def extract_f0_from_wav_and_mel(wav, mel,
81
+ hop_size=320,
82
+ audio_sample_rate=16000,
83
+ ):
84
+ time_step = hop_size / audio_sample_rate * 1000
85
+ f0_min = 80
86
+ f0_max = 750
87
+ f0 = parselmouth.Sound(wav, audio_sample_rate).to_pitch_ac(
88
+ time_step=time_step / 1000, voicing_threshold=0.6,
89
+ pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
90
+
91
+ delta_l = len(mel) - len(f0)
92
+ assert np.abs(delta_l) <= 8
93
+ if delta_l > 0:
94
+ f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
95
+ f0 = f0[:len(mel)]
96
+ pitch_coarse = f0_to_coarse(f0)
97
+ return f0, pitch_coarse
98
+
99
+
100
+ def extract_mel_f0_from_fname(wav_name=None, out_name=None):
101
+ try:
102
+ if out_name is None:
+ out_name = wav_name.replace(".wav", "_mel_f0.npy").replace("/audio/", "/mel_f0/")
103
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
104
+
105
+ wav, mel = extract_mel_from_fname(wav_name)
106
+ f0, f0_coarse = extract_f0_from_wav_and_mel(wav, mel)
107
+ out_dict = {
108
+ "mel": mel, # [T, 80]
109
+ "f0": f0,
110
+ }
111
+ np.save(out_name, out_dict)
112
+ except Exception as e:
113
+ print(e)
114
+
115
+ def extract_mel_f0_from_video_name(mp4_name, wav_name=None, out_name=None):
116
+ if mp4_name.endswith(".mp4"):
117
+ wav_name = split_wav(mp4_name, wav_name)
118
+ if out_name is None:
119
+ out_name = mp4_name.replace(".mp4", "_mel_f0.npy").replace("/video/", "/mel_f0/")
120
+ elif mp4_name.endswith(".wav"):
121
+ wav_name = mp4_name
122
+ if out_name is None:
123
+ out_name = mp4_name.replace(".wav", "_mel_f0.npy").replace("/audio/", "/mel_f0/")
124
+
125
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
126
+
127
+ wav, mel = extract_mel_from_fname(wav_name)
128
+
129
+ f0, f0_coarse = extract_f0_from_wav_and_mel(wav, mel)
130
+ out_dict = {
131
+ "mel": mel, # [T, 80]
132
+ "f0": f0,
133
+ }
134
+ np.save(out_name, out_dict)
135
+
136
+
137
+ if __name__ == '__main__':
138
+ from argparse import ArgumentParser
139
+ parser = ArgumentParser()
140
+ parser.add_argument('--video_id', type=str, default='May', help='')
141
+ args = parser.parse_args()
142
+ ### Process Single Long Audio for NeRF dataset
143
+ person_id = args.video_id
144
+
145
+ wav_16k_name = f"data/processed/videos/{person_id}/aud.wav"
146
+ out_name = f"data/processed/videos/{person_id}/aud_mel_f0.npy"
147
+ extract_mel_f0_from_video_name(wav_16k_name, out_name=out_name) # out_name must be passed by keyword; the second positional parameter is wav_name
148
+ print(f"Saved at {out_name}")
data_gen/utils/process_audio/resample_audio_to_16k.py ADDED
@@ -0,0 +1,49 @@
1
+ import os, glob
2
+ from utils.commons.os_utils import multiprocess_glob
3
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
4
+
5
+
6
+ def extract_wav16k_job(audio_name:str):
7
+ out_path = audio_name.replace("/audio_raw/","/audio/",1)
8
+ assert out_path != audio_name # prevent inplace
9
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
10
+ ffmpeg_path = "/usr/bin/ffmpeg"
11
+
12
+ cmd = f'{ffmpeg_path} -i {audio_name} -ar 16000 -v quiet -y {out_path}'
13
+ os.system(cmd)
14
+
15
+ if __name__ == '__main__':
16
+ import argparse, glob, tqdm, random
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument("--aud_dir", default='/home/tiger/datasets/raw/CMLR/audio_raw/')
19
+ parser.add_argument("--ds_name", default='CMLR')
20
+ parser.add_argument("--num_workers", default=64, type=int)
21
+ parser.add_argument("--process_id", default=0, type=int)
22
+ parser.add_argument("--total_process", default=1, type=int)
23
+ args = parser.parse_args()
24
+ print(f"args {args}")
25
+
26
+ aud_dir = args.aud_dir
27
+ ds_name = args.ds_name
28
+ if ds_name in ['CMLR']:
29
+ aud_name_pattern = os.path.join(aud_dir, "*/*/*.wav")
30
+ aud_names = multiprocess_glob(aud_name_pattern)
31
+ else:
32
+ raise NotImplementedError()
33
+ aud_names = sorted(aud_names)
34
+ print(f"total audio number : {len(aud_names)}")
35
+ print(f"first {aud_names[0]} last {aud_names[-1]}")
36
+ # exit()
37
+ process_id = args.process_id
38
+ total_process = args.total_process
39
+ if total_process > 1:
40
+ assert process_id <= total_process -1
41
+ num_samples_per_process = len(aud_names) // total_process
42
+ if process_id == total_process - 1: # the last process takes the remainder
43
+ aud_names = aud_names[process_id * num_samples_per_process : ]
44
+ else:
45
+ aud_names = aud_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
46
+
47
+ for i, res in multiprocess_run_tqdm(extract_wav16k_job, aud_names, num_workers=args.num_workers, desc="resampling audios to 16kHz"):
48
+ pass
49
+
data_gen/utils/process_image/extract_lm2d.py ADDED
@@ -0,0 +1,197 @@
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+ import sys
4
+
5
+ import glob
6
+ import cv2
7
+ import tqdm
8
+ import numpy as np
9
+ from data_gen.utils.mp_feature_extractors.face_landmarker import MediapipeLandmarker
10
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
11
+ import warnings
12
+ warnings.filterwarnings('ignore')
13
+
14
+ import random
15
+ random.seed(42)
16
+
17
+ import pickle
18
+ import json
19
+ import gzip
20
+ from typing import Any
21
+
22
+ def load_file(filename, is_gzip: bool = False, is_json: bool = False) -> Any:
23
+ if is_json:
24
+ if is_gzip:
25
+ with gzip.open(filename, "rt", encoding="utf-8") as f: # text mode is required when an encoding is given
26
+ loaded_object = json.load(f)
27
+ return loaded_object
28
+ else:
29
+ with open(filename, "r", encoding="utf-8") as f:
30
+ loaded_object = json.load(f)
31
+ return loaded_object
32
+ else:
33
+ if is_gzip:
34
+ with gzip.open(filename, "rb") as f:
35
+ loaded_object = pickle.load(f)
36
+ return loaded_object
37
+ else:
38
+ with open(filename, "rb") as f:
39
+ loaded_object = pickle.load(f)
40
+ return loaded_object
41
+
42
+ def save_file(filename, content, is_gzip: bool = False, is_json: bool = False) -> None:
43
+ if is_json:
44
+ if is_gzip:
45
+ with gzip.open(filename, "wt", encoding="utf-8") as f: # text mode is required when an encoding is given
46
+ json.dump(content, f)
47
+ else:
48
+ with open(filename, "w", encoding="utf-8") as f:
49
+ json.dump(content, f)
50
+ else:
51
+ if is_gzip:
52
+ with gzip.open(filename, "wb") as f:
53
+ pickle.dump(content, f)
54
+ else:
55
+ with open(filename, "wb") as f:
56
+ pickle.dump(content, f)
57
+
58
+ face_landmarker = None
59
+
60
+ def extract_lms_mediapipe_job(img):
61
+ if img is None:
62
+ return None
63
+ global face_landmarker
64
+ if face_landmarker is None:
65
+ face_landmarker = MediapipeLandmarker()
66
+ lm478 = face_landmarker.extract_lm478_from_img(img)
67
+ return lm478
68
+
69
+ def extract_landmark_job(img_name):
70
+ try:
71
+ # if img_name == 'datasets/PanoHeadGen/raw/images/multi_view/chunk_0/seed0000002.png':
72
+ # print(1)
73
+ # input()
74
+ out_name = img_name.replace("/images_512/", "/lms_2d/").replace(".png","_lms.npy")
75
+ if os.path.exists(out_name):
76
+ print("out exists, skip...")
77
+ return
78
+ try:
79
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
80
+ except:
81
+ pass
82
+ img = cv2.imread(img_name)[:,:,::-1]
83
+
84
+ if img is not None:
85
+ lm468 = extract_lms_mediapipe_job(img)
86
+ if lm468 is not None:
87
+ np.save(out_name, lm468)
88
+ # print("Hahaha, solve one item!!!")
89
+ except Exception as e:
90
+ print(e)
91
+ pass
92
+
93
+ def out_exist_job(img_name):
94
+ out_name = img_name.replace("/images_512/", "/lms_2d/").replace(".png","_lms.npy")
95
+ if os.path.exists(out_name):
96
+ return None
97
+ else:
98
+ return img_name
99
+
100
+ # def get_todo_img_names(img_names):
101
+ # todo_img_names = []
102
+ # for i, res in multiprocess_run_tqdm(out_exist_job, img_names, num_workers=64):
103
+ # if res is not None:
104
+ # todo_img_names.append(res)
105
+ # return todo_img_names
106
+
107
+
108
+ if __name__ == '__main__':
109
+ import argparse, glob, tqdm, random
110
+ parser = argparse.ArgumentParser()
111
+ parser.add_argument("--img_dir", default='/home/tiger/datasets/raw/FFHQ/images_512/')
112
+ parser.add_argument("--ds_name", default='FFHQ')
113
+ parser.add_argument("--num_workers", default=64, type=int)
114
+ parser.add_argument("--process_id", default=0, type=int)
115
+ parser.add_argument("--total_process", default=1, type=int)
116
+ parser.add_argument("--reset", action='store_true')
117
+ parser.add_argument("--img_names_file", default="img_names.pkl", type=str)
118
+ parser.add_argument("--load_img_names", action="store_true")
119
+
120
+ args = parser.parse_args()
121
+ print(f"args {args}")
122
+ img_dir = args.img_dir
123
+ img_names_file = os.path.join(img_dir, args.img_names_file)
124
+ if args.load_img_names:
125
+ img_names = load_file(img_names_file)
126
+ print(f"load image names from {img_names_file}")
127
+ else:
128
+ if args.ds_name == 'FFHQ_MV':
129
+ img_name_pattern1 = os.path.join(img_dir, "ref_imgs/*.png")
130
+ img_names1 = glob.glob(img_name_pattern1)
131
+ img_name_pattern2 = os.path.join(img_dir, "mv_imgs/*.png")
132
+ img_names2 = glob.glob(img_name_pattern2)
133
+ img_names = img_names1 + img_names2
134
+ img_names = sorted(img_names)
135
+ elif args.ds_name == 'FFHQ':
136
+ img_name_pattern = os.path.join(img_dir, "*.png")
137
+ img_names = glob.glob(img_name_pattern)
138
+ img_names = sorted(img_names)
139
+ elif args.ds_name == "PanoHeadGen":
140
+ # img_name_patterns = ["ref/*/*.png", "multi_view/*/*.png", "reverse/*/*.png"]
141
+ img_name_patterns = ["ref/*/*.png"]
142
+ img_names = []
143
+ for img_name_pattern in img_name_patterns:
144
+ img_name_pattern_full = os.path.join(img_dir, img_name_pattern)
145
+ img_names_part = glob.glob(img_name_pattern_full)
146
+ img_names.extend(img_names_part)
147
+ img_names = sorted(img_names)
148
+
149
+ # save image names
150
+ if not args.load_img_names:
151
+ save_file(img_names_file, img_names)
152
+ print(f"save image names in {img_names_file}")
153
+
154
+ print(f"total images number: {len(img_names)}")
155
+
156
+
157
+ process_id = args.process_id
158
+ total_process = args.total_process
159
+ if total_process > 1:
160
+ assert process_id <= total_process -1
161
+ num_samples_per_process = len(img_names) // total_process
162
+ if process_id == total_process - 1: # the last process takes the remainder
163
+ img_names = img_names[process_id * num_samples_per_process : ]
164
+ else:
165
+ img_names = img_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
166
+
167
+ # if not args.reset:
168
+ # img_names = get_todo_img_names(img_names)
169
+
170
+
171
+ print(f"todo_image {img_names[:10]}")
172
+ print(f"processing images number in this process: {len(img_names)}")
173
+ # print(f"todo images number: {len(img_names)}")
174
+ # input()
175
+ # exit()
176
+
177
+ if args.num_workers == 1:
178
+ index = 0
179
+ for img_name in tqdm.tqdm(img_names, desc=f"Root process {args.process_id}: extracting MP-based landmark2d"):
180
+ try:
181
+ extract_landmark_job(img_name)
182
+ except Exception as e:
183
+ print(e)
184
+ pass
185
+ if index % max(1, int(len(img_names) * 0.003)) == 0:
186
+ print(f"processed {index} / {len(img_names)}")
187
+ sys.stdout.flush()
188
+ index += 1
189
+ else:
190
+ for i, res in multiprocess_run_tqdm(
191
+ extract_landmark_job, img_names,
192
+ num_workers=args.num_workers,
193
+ desc=f"Root {args.process_id}: extracting MP-based landmark2d"):
194
+ # if index % max(1, int(len(img_names) * 0.003)) == 0:
195
+ print(f"processed {i+1} / {len(img_names)}")
196
+ sys.stdout.flush()
197
+ print(f"Root {args.process_id}: Finished extracting.")
data_gen/utils/process_image/extract_segment_imgs.py ADDED
@@ -0,0 +1,114 @@
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+
4
+ import glob
5
+ import cv2
6
+ import tqdm
7
+ import numpy as np
8
+ import PIL
9
+ from utils.commons.tensor_utils import convert_to_np
10
+ import torch
11
+ import mediapipe as mp
12
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
13
+ from data_gen.utils.mp_feature_extractors.mp_segmenter import MediapipeSegmenter
14
+ from data_gen.utils.process_video.extract_segment_imgs import inpaint_torso_job, extract_background, save_rgb_image_to_path
15
+ seg_model = MediapipeSegmenter()
16
+
17
+
18
+ def extract_segment_job(img_name):
19
+ try:
20
+ img = cv2.imread(img_name)
21
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
22
+
23
+ segmap = seg_model._cal_seg_map(img)
24
+ bg_img = extract_background([img], [segmap])
25
+ out_img_name = img_name.replace("/images_512/",f"/bg_img/").replace(".mp4", ".jpg")
26
+ save_rgb_image_to_path(bg_img, out_img_name)
27
+
28
+ com_img = img.copy()
29
+ bg_part = segmap[0].astype(bool)[..., None].repeat(3,axis=-1)
30
+ com_img[bg_part] = bg_img[bg_part]
31
+ out_img_name = img_name.replace("/images_512/",f"/com_imgs/")
32
+ save_rgb_image_to_path(com_img, out_img_name)
33
+
34
+ for mode in ['head', 'torso', 'person', 'torso_with_bg', 'bg']:
35
+ out_img, _ = seg_model._seg_out_img_with_segmap(img, segmap, mode=mode)
36
+ out_img_name = img_name.replace("/images_512/",f"/{mode}_imgs/")
37
+ out_img = cv2.cvtColor(out_img, cv2.COLOR_RGB2BGR)
38
+ try: os.makedirs(os.path.dirname(out_img_name), exist_ok=True)
39
+ except: pass
40
+ cv2.imwrite(out_img_name, out_img)
41
+
42
+ inpaint_torso_img, inpaint_torso_with_bg_img, _, _ = inpaint_torso_job(img, segmap)
43
+ out_img_name = img_name.replace("/images_512/",f"/inpaint_torso_imgs/")
44
+ save_rgb_image_to_path(inpaint_torso_img, out_img_name)
45
+ inpaint_torso_with_bg_img[bg_part] = bg_img[bg_part]
46
+ out_img_name = img_name.replace("/images_512/",f"/inpaint_torso_with_com_bg_imgs/")
47
+ save_rgb_image_to_path(inpaint_torso_with_bg_img, out_img_name)
48
+ return 0
49
+ except Exception as e:
50
+ print(e)
51
+ return 1
52
+
53
+ def out_exist_job(img_name):
54
+ out_name1 = img_name.replace("/images_512/", "/head_imgs/")
55
+ out_name2 = img_name.replace("/images_512/", "/com_imgs/")
56
+ out_name3 = img_name.replace("/images_512/", "/inpaint_torso_with_com_bg_imgs/")
57
+
58
+ if os.path.exists(out_name1) and os.path.exists(out_name2) and os.path.exists(out_name3):
59
+ return None
60
+ else:
61
+ return img_name
62
+
63
+ def get_todo_img_names(img_names):
64
+ todo_img_names = []
65
+ for i, res in multiprocess_run_tqdm(out_exist_job, img_names, num_workers=64):
66
+ if res is not None:
67
+ todo_img_names.append(res)
68
+ return todo_img_names
69
+
70
+
71
+ if __name__ == '__main__':
72
+ import argparse, glob, tqdm, random
73
+ parser = argparse.ArgumentParser()
74
+ parser.add_argument("--img_dir", default='./images_512')
75
+ # parser.add_argument("--img_dir", default='/home/tiger/datasets/raw/FFHQ/images_512')
76
+ parser.add_argument("--ds_name", default='FFHQ')
77
+ parser.add_argument("--num_workers", default=1, type=int)
78
+ parser.add_argument("--seed", default=0, type=int)
79
+ parser.add_argument("--process_id", default=0, type=int)
80
+ parser.add_argument("--total_process", default=1, type=int)
81
+ parser.add_argument("--reset", action='store_true')
82
+
83
+ args = parser.parse_args()
84
+ img_dir = args.img_dir
85
+ if args.ds_name == 'FFHQ_MV':
86
+ img_name_pattern1 = os.path.join(img_dir, "ref_imgs/*.png")
87
+ img_names1 = glob.glob(img_name_pattern1)
88
+ img_name_pattern2 = os.path.join(img_dir, "mv_imgs/*.png")
89
+ img_names2 = glob.glob(img_name_pattern2)
90
+ img_names = img_names1 + img_names2
91
+ elif args.ds_name == 'FFHQ':
92
+ img_name_pattern = os.path.join(img_dir, "*.png")
93
+ img_names = glob.glob(img_name_pattern)
94
+
95
+ img_names = sorted(img_names)
96
+ random.seed(args.seed)
97
+ random.shuffle(img_names)
98
+
99
+ process_id = args.process_id
100
+ total_process = args.total_process
101
+ if total_process > 1:
102
+ assert process_id <= total_process -1
103
+ num_samples_per_process = len(img_names) // total_process
104
+ if process_id == total_process - 1: # the last process takes the remainder
105
+ img_names = img_names[process_id * num_samples_per_process : ]
106
+ else:
107
+ img_names = img_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
108
+
109
+ if not args.reset:
110
+ img_names = get_todo_img_names(img_names)
111
+ print(f"todo images number: {len(img_names)}")
112
+
113
+ for vid_name in multiprocess_run_tqdm(extract_segment_job ,img_names, desc=f"Root process {args.process_id}: extracting segment images", num_workers=args.num_workers):
114
+ pass
data_gen/utils/process_image/fit_3dmm_landmark.py ADDED
@@ -0,0 +1,369 @@
1
+ from numpy.core.numeric import require
2
+ from numpy.lib.function_base import quantile
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import copy
6
+ import numpy as np
7
+
8
+ import os
9
+ import sys
10
+ import cv2
11
+ import argparse
12
+ import tqdm
13
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
14
+ from data_gen.utils.mp_feature_extractors.face_landmarker import MediapipeLandmarker
15
+
16
+ from deep_3drecon.deep_3drecon_models.bfm import ParametricFaceModel
17
+ import pickle
18
+
19
+ face_model = ParametricFaceModel(bfm_folder='deep_3drecon/BFM',
20
+ camera_distance=10, focal=1015, keypoint_mode='mediapipe')
21
+ face_model.to("cuda")
22
+
23
+
24
+ index_lm68_from_lm468 = [127,234,93,132,58,136,150,176,152,400,379,365,288,361,323,454,356,70,63,105,66,107,336,296,334,293,300,168,197,5,4,75,97,2,326,305,
25
+ 33,160,158,133,153,144,362,385,387,263,373,380,61,40,37,0,267,270,291,321,314,17,84,91,78,81,13,311,308,402,14,178]
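+ # (added note) indices into the 468 mediapipe face-mesh landmarks that approximate the standard 68-point
+ # (dlib-style) landmark layout.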
26
+
27
+ dir_path = os.path.dirname(os.path.realpath(__file__))
28
+
29
+ LAMBDA_REG_ID = 0.3
30
+ LAMBDA_REG_EXP = 0.05
31
+
32
+ def save_file(name, content):
33
+ with open(name, "wb") as f:
34
+ pickle.dump(content, f)
35
+
36
+ def load_file(name):
37
+ with open(name, "rb") as f:
38
+ content = pickle.load(f)
39
+ return content
40
+
41
+ def cal_lan_loss_mp(proj_lan, gt_lan):
42
+ # [B, 68, 2]
43
+ loss = (proj_lan - gt_lan).pow(2)
44
+ # loss = (proj_lan - gt_lan).abs()
45
+ unmatch_mask = [ 93, 127, 132, 234, 323, 356, 361, 454]
46
+ eye = [33,246,161,160,159,158,157,173,133,155,154,153,145,144,163,7] + [263,466,388,387,386,385,384,398,362,382,381,380,374,373,390,249]
47
+ inner_lip = [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95]
48
+ outer_lip = [61,185,40,39,37,0,267,269,270,409,291,375,321,405,314,17,84,181,91,146]
49
+ weights = torch.ones_like(loss)
50
+ weights[:, eye] = 5
51
+ weights[:, inner_lip] = 2
52
+ weights[:, outer_lip] = 2
53
+ weights[:, unmatch_mask] = 0
54
+ loss = loss * weights
55
+ return torch.mean(loss)
56
+
57
+ def cal_lan_loss(proj_lan, gt_lan):
58
+ # [B, 68, 2]
59
+ loss = (proj_lan - gt_lan)** 2
60
+ # use the ldm weights from deep3drecon, see deep_3drecon/deep_3drecon_models/losses.py
61
+ weights = torch.zeros_like(loss)
62
+ weights = torch.ones_like(loss)
63
+ weights[:, 36:48, :] = 3 # eye 12 points
64
+ weights[:, -8:, :] = 3 # inner lip 8 points
65
+ weights[:, 28:31, :] = 3 # nose 3 points
66
+ loss = loss * weights
67
+ return torch.mean(loss)
68
+
69
+ def set_requires_grad(tensor_list):
70
+ for tensor in tensor_list:
71
+ tensor.requires_grad = True
72
+
73
+ def read_video_to_frames(img_name):
74
+ frames = []
75
+ cap = cv2.VideoCapture(img_name)
76
+ while cap.isOpened():
77
+ ret, frame_bgr = cap.read()
78
+ if frame_bgr is None:
79
+ break
80
+ frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
81
+ frames.append(frame_rgb)
82
+ return np.stack(frames)
83
+
84
+ @torch.enable_grad()
85
+ def fit_3dmm_for_a_image(img_name, debug=False, keypoint_mode='mediapipe', device="cuda:0", save=True):
86
+ img = cv2.imread(img_name)
87
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
88
+ img_h, img_w = img.shape[0], img.shape[1] # height and width (previously both were taken from shape[0])
89
+ assert img_h == img_w
90
+ num_frames = 1
91
+
92
+ lm_name = img_name.replace("/images_512/", "/lms_2d/").replace(".png", "_lms.npy")
93
+ if lm_name.endswith('_lms.npy') and os.path.exists(lm_name):
94
+ lms = np.load(lm_name)
95
+ else:
96
+ # print("lms_2d file not found, try to extract it from image...")
97
+ try:
98
+ landmarker = MediapipeLandmarker()
99
+ lms = landmarker.extract_lm478_from_img_name(img_name)
100
+ # lms = landmarker.extract_lm478_from_img(img)
101
+ except Exception as e:
102
+ print(e)
103
+ return
104
+ if lms is None:
105
+ print("got None lms_2d; please check that the image contains exactly one face, exiting...")
106
+ return
107
+ lms = lms[:468].reshape([468,2])
108
+ lms = torch.FloatTensor(lms).to(device=device)
109
+ lms[..., 1] = img_h - lms[..., 1] # flip the height axis
110
+
111
+ if keypoint_mode == 'mediapipe':
112
+ cal_lan_loss_fn = cal_lan_loss_mp
113
+ out_name = img_name.replace("/images_512/", "/coeff_fit_mp/").replace(".png", "_coeff_fit_mp.npy")
114
+ else:
115
+ cal_lan_loss_fn = cal_lan_loss
116
+ out_name = img_name.replace("/images_512/", "/coeff_fit_lm68/").replace(".png", "_coeff_fit_lm68.npy")
117
+ try:
118
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
119
+ except:
120
+ pass
121
+
122
+ id_dim, exp_dim = 80, 64
123
+ sel_ids = np.arange(0, num_frames, 40)
124
+ sel_num = sel_ids.shape[0]
125
+ arg_focal = face_model.focal
126
+
127
+ h = w = face_model.center * 2
128
+ img_scale_factor = img_h / h
129
+ lms /= img_scale_factor
130
+ cxy = torch.tensor((w / 2.0, h / 2.0), dtype=torch.float).to(device=device)
131
+
132
+ id_para = lms.new_zeros((num_frames, id_dim), requires_grad=True) # lms.new_zeros((1, id_dim), requires_grad=True)
133
+ exp_para = lms.new_zeros((num_frames, exp_dim), requires_grad=True)
134
+ euler_angle = lms.new_zeros((num_frames, 3), requires_grad=True)
135
+ trans = lms.new_zeros((num_frames, 3), requires_grad=True)
136
+
137
+ focal_length = lms.new_zeros(1, requires_grad=True)
138
+ focal_length.data += arg_focal
139
+
140
+ set_requires_grad([id_para, exp_para, euler_angle, trans])
141
+
142
+ optimizer_idexp = torch.optim.Adam([id_para, exp_para], lr=.1)
143
+ optimizer_frame = torch.optim.Adam([euler_angle, trans], lr=.1)
144
+
145
+ # with the other parameters at their initial values, first optimize only the euler angles and translation
146
+ for _ in range(200):
147
+ proj_geo = face_model.compute_for_landmark_fit(
148
+ id_para, exp_para, euler_angle, trans)
149
+ loss_lan = cal_lan_loss_fn(proj_geo[:, :, :2], lms.detach())
150
+ loss = loss_lan
151
+ optimizer_frame.zero_grad()
152
+ loss.backward()
153
+ optimizer_frame.step()
154
+ # print(f"loss_lan: {loss_lan.item():.2f}, euler_abs_mean: {euler_angle.abs().mean().item():.4f}, euler_std: {euler_angle.std().item():.4f}, euler_min: {euler_angle.min().item():.4f}, euler_max: {euler_angle.max().item():.4f}")
155
+ # print(f"trans_z_mean: {trans[...,2].mean().item():.4f}, trans_z_std: {trans[...,2].std().item():.4f}, trans_min: {trans[...,2].min().item():.4f}, trans_max: {trans[...,2].max().item():.4f}")
156
+
157
+ for param_group in optimizer_frame.param_groups:
158
+ param_group['lr'] = 0.1
159
+
160
+ # "jointly roughly training id exp euler trans"
161
+ for _ in range(200):
162
+ proj_geo = face_model.compute_for_landmark_fit(
163
+ id_para, exp_para, euler_angle, trans)
164
+ loss_lan = cal_lan_loss_fn(
165
+ proj_geo[:, :, :2], lms.detach())
166
+ loss_regid = torch.mean(id_para*id_para) # L2 regularization
167
+ loss_regexp = torch.mean(exp_para * exp_para)
168
+
169
+ loss = loss_lan + loss_regid * LAMBDA_REG_ID + loss_regexp * LAMBDA_REG_EXP
170
+ optimizer_idexp.zero_grad()
171
+ optimizer_frame.zero_grad()
172
+ loss.backward()
173
+ optimizer_idexp.step()
174
+ optimizer_frame.step()
175
+ # print(f"loss_lan: {loss_lan.item():.2f}, loss_reg_id: {loss_regid.item():.2f},loss_reg_exp: {loss_regexp.item():.2f},")
176
+ # print(f"euler_abs_mean: {euler_angle.abs().mean().item():.4f}, euler_std: {euler_angle.std().item():.4f}, euler_min: {euler_angle.min().item():.4f}, euler_max: {euler_angle.max().item():.4f}")
177
+ # print(f"trans_z_mean: {trans[...,2].mean().item():.4f}, trans_z_std: {trans[...,2].std().item():.4f}, trans_min: {trans[...,2].min().item():.4f}, trans_max: {trans[...,2].max().item():.4f}")
178
+
179
+ # start fine training, initialized from the roughly trained results
180
+ id_para_ = lms.new_zeros((num_frames, id_dim), requires_grad=True)
181
+ id_para_.data = id_para.data.clone()
182
+ id_para = id_para_
183
+ exp_para_ = lms.new_zeros((num_frames, exp_dim), requires_grad=True)
184
+ exp_para_.data = exp_para.data.clone()
185
+ exp_para = exp_para_
186
+ euler_angle_ = lms.new_zeros((num_frames, 3), requires_grad=True)
187
+ euler_angle_.data = euler_angle.data.clone()
188
+ euler_angle = euler_angle_
189
+ trans_ = lms.new_zeros((num_frames, 3), requires_grad=True)
190
+ trans_.data = trans.data.clone()
191
+ trans = trans_
192
+
193
+ batch_size = 1
194
+
195
+ # "fine fitting the 3DMM in batches"
196
+ for i in range(int((num_frames-1)/batch_size+1)):
197
+ if (i+1)*batch_size > num_frames:
198
+ start_n = num_frames-batch_size
199
+ sel_ids = np.arange(max(num_frames-batch_size,0), num_frames)
200
+ else:
201
+ start_n = i*batch_size
202
+ sel_ids = np.arange(i*batch_size, i*batch_size+batch_size)
203
+ sel_lms = lms[sel_ids]
204
+
205
+ sel_id_para = id_para.new_zeros(
206
+ (batch_size, id_dim), requires_grad=True)
207
+ sel_id_para.data = id_para[sel_ids].clone()
208
+ sel_exp_para = exp_para.new_zeros(
209
+ (batch_size, exp_dim), requires_grad=True)
210
+ sel_exp_para.data = exp_para[sel_ids].clone()
211
+ sel_euler_angle = euler_angle.new_zeros(
212
+ (batch_size, 3), requires_grad=True)
213
+ sel_euler_angle.data = euler_angle[sel_ids].clone()
214
+ sel_trans = trans.new_zeros((batch_size, 3), requires_grad=True)
215
+ sel_trans.data = trans[sel_ids].clone()
216
+
217
+ set_requires_grad([sel_id_para, sel_exp_para, sel_euler_angle, sel_trans])
218
+ optimizer_cur_batch = torch.optim.Adam(
219
+ [sel_id_para, sel_exp_para, sel_euler_angle, sel_trans], lr=0.005)
220
+
221
+ for j in range(50):
222
+ proj_geo = face_model.compute_for_landmark_fit(
223
+ sel_id_para, sel_exp_para, sel_euler_angle, sel_trans)
224
+ loss_lan = cal_lan_loss_fn(
225
+ proj_geo[:, :, :2], lms.unsqueeze(0).detach())
226
+
227
+ loss_regid = torch.mean(sel_id_para*sel_id_para) # L2 regularization
228
+ loss_regexp = torch.mean(sel_exp_para*sel_exp_para)
229
+ loss = loss_lan + loss_regid * LAMBDA_REG_ID + loss_regexp * LAMBDA_REG_EXP
230
+ optimizer_cur_batch.zero_grad()
231
+ loss.backward()
232
+ optimizer_cur_batch.step()
233
+ print(f"batch {i} | loss_lan: {loss_lan.item():.2f}, loss_reg_id: {loss_regid.item():.2f},loss_reg_exp: {loss_regexp.item():.2f}")
234
+ id_para[sel_ids].data = sel_id_para.data.clone()
235
+ exp_para[sel_ids].data = sel_exp_para.data.clone()
236
+ euler_angle[sel_ids].data = sel_euler_angle.data.clone()
237
+ trans[sel_ids].data = sel_trans.data.clone()
238
+
239
+ coeff_dict = {'id': id_para.detach().cpu().numpy(), 'exp': exp_para.detach().cpu().numpy(),
240
+ 'euler': euler_angle.detach().cpu().numpy(), 'trans': trans.detach().cpu().numpy()}
241
+ if save:
242
+ np.save(out_name, coeff_dict, allow_pickle=True)
243
+
244
+ if debug:
245
+ import imageio
246
+ debug_name = img_name.replace("/images_512/", "/coeff_fit_mp_debug/").replace(".png", "_debug.png").replace(".jpg", "_debug.jpg")
247
+ try: os.makedirs(os.path.dirname(debug_name), exist_ok=True)
248
+ except: pass
249
+ proj_geo = face_model.compute_for_landmark_fit(id_para, exp_para, euler_angle, trans)
250
+ lm68s = proj_geo[:,:,:2].detach().cpu().numpy() # [T, 68,2]
251
+ lm68s = lm68s * img_scale_factor
252
+ lms = lms * img_scale_factor
253
+ lm68s[..., 1] = img_h - lm68s[..., 1] # flip the height axis
254
+ lms[..., 1] = img_h - lms[..., 1] # flip the height axis
255
+ lm68s = lm68s.astype(int)
256
+ lm68s = lm68s.reshape([-1,2])
257
+ lms = lms.cpu().numpy().astype(int).reshape([-1,2])
258
+ for lm in lm68s:
259
+ img = cv2.circle(img, lm, 1, (0, 0, 255), thickness=-1)
260
+ for gt_lm in lms:
261
+ img = cv2.circle(img, gt_lm, 2, (255, 0, 0), thickness=1)
262
+ imageio.imwrite(debug_name, img)
263
+ print(f"debug img saved at {debug_name}")
264
+ return coeff_dict
265
+
266
+ def out_exist_job(vid_name):
267
+ out_name = vid_name.replace("/images_512/", "/coeff_fit_mp/").replace(".png","_coeff_fit_mp.npy")
268
+ # if os.path.exists(out_name) or not os.path.exists(lms_name):
269
+ if os.path.exists(out_name):
270
+ return None
271
+ else:
272
+ return vid_name
273
+
274
+ def get_todo_img_names(img_names):
275
+ todo_img_names = []
276
+ for i, res in multiprocess_run_tqdm(out_exist_job, img_names, num_workers=16):
277
+ if res is not None:
278
+ todo_img_names.append(res)
279
+ return todo_img_names
280
+
281
+
282
+ if __name__ == '__main__':
283
+ import argparse, glob, tqdm
284
+ parser = argparse.ArgumentParser()
285
+ parser.add_argument("--img_dir", default='/home/tiger/datasets/raw/FFHQ/images_512')
286
+ parser.add_argument("--ds_name", default='FFHQ')
287
+ parser.add_argument("--seed", default=0, type=int)
288
+ parser.add_argument("--process_id", default=0, type=int)
289
+ parser.add_argument("--total_process", default=1, type=int)
290
+ parser.add_argument("--keypoint_mode", default='mediapipe', type=str)
291
+ parser.add_argument("--debug", action='store_true')
292
+ parser.add_argument("--reset", action='store_true')
293
+ parser.add_argument("--device", default="cuda:0", type=str)
294
+ parser.add_argument("--output_log", action='store_true')
295
+ parser.add_argument("--load_names", action="store_true")
296
+
297
+ args = parser.parse_args()
298
+ img_dir = args.img_dir
299
+ load_names = args.load_names
300
+
301
+ print(f"args {args}")
302
+
303
+ if args.ds_name == 'single_img':
304
+ img_names = [img_dir]
305
+ else:
306
+ img_names_path = os.path.join(img_dir, "img_dir.pkl")
307
+ if os.path.exists(img_names_path) and load_names:
308
+ print(f"loading vid names from {img_names_path}")
309
+ img_names = load_file(img_names_path)
310
+ else:
311
+ if args.ds_name == 'FFHQ_MV':
312
+ img_name_pattern1 = os.path.join(img_dir, "ref_imgs/*.png")
313
+ img_names1 = glob.glob(img_name_pattern1)
314
+ img_name_pattern2 = os.path.join(img_dir, "mv_imgs/*.png")
315
+ img_names2 = glob.glob(img_name_pattern2)
316
+ img_names = img_names1 + img_names2
317
+ img_names = sorted(img_names)
318
+ elif args.ds_name == 'FFHQ':
319
+ img_name_pattern = os.path.join(img_dir, "*.png")
320
+ img_names = glob.glob(img_name_pattern)
321
+ img_names = sorted(img_names)
322
+ elif args.ds_name == "PanoHeadGen":
323
+ img_name_patterns = ["ref/*/*.png"]
324
+ img_names = []
325
+ for img_name_pattern in img_name_patterns:
326
+ img_name_pattern_full = os.path.join(img_dir, img_name_pattern)
327
+ img_names_part = glob.glob(img_name_pattern_full)
328
+ img_names.extend(img_names_part)
329
+ img_names = sorted(img_names)
330
+ print(f"saving image names to {img_names_path}")
331
+ save_file(img_names_path, img_names)
332
+
333
+ # import random
334
+ # random.seed(args.seed)
335
+ # random.shuffle(img_names)
336
+
337
+ face_model = ParametricFaceModel(bfm_folder='deep_3drecon/BFM',
338
+ camera_distance=10, focal=1015, keypoint_mode=args.keypoint_mode)
339
+ face_model.to(torch.device(args.device))
340
+
341
+ process_id = args.process_id
342
+ total_process = args.total_process
343
+ if total_process > 1:
344
+ assert process_id <= total_process -1 and process_id >= 0
345
+ num_samples_per_process = len(img_names) // total_process
346
+ if process_id == total_process - 1: # the last process takes the remainder
347
+ img_names = img_names[process_id * num_samples_per_process : ]
348
+ else:
349
+ img_names = img_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
350
+ print(f"image names number (before filtering): {len(img_names)}")
351
+
352
+
353
+ if not args.reset:
354
+ img_names = get_todo_img_names(img_names)
355
+
356
+ print(f"image names number (after filtering): {len(img_names)}")
357
+ for i in tqdm.trange(len(img_names), desc=f"process {process_id}: fitting 3dmm ..."):
358
+ img_name = img_names[i]
359
+ try:
360
+ fit_3dmm_for_a_image(img_name, args.debug, device=args.device)
361
+ except Exception as e:
362
+ print(img_name, e)
363
+ if args.output_log and i % max(int(len(img_names) * 0.003), 1) == 0:
364
+ print(f"process {process_id}: {i + 1} / {len(img_names)} done")
365
+ sys.stdout.flush()
366
+ sys.stderr.flush()
367
+
368
+ print(f"process {process_id}: fitting 3dmm all done")
369
+
data_gen/utils/process_video/euler2quaterion.py ADDED
@@ -0,0 +1,35 @@
1
+ import numpy as np
2
+ import torch
3
+ import math
4
+ import numba
5
+ from scipy.spatial.transform import Rotation as R
6
+
7
+ def euler2quaterion(euler, use_radian=True):
8
+ """
9
+ euler: np.array, [batch, 3]
10
+ return: the quaterion, np.array, [batch, 4]
11
+ """
12
+ r = R.from_euler('xyz',euler, degrees=not use_radian)
13
+ return r.as_quat()
14
+
15
+ def quaterion2euler(quat, use_radian=True):
16
+ """
17
+ quat: np.array, [batch, 4]
18
+ return: the euler, np.array, [batch, 3]
19
+ """
20
+ r = R.from_quat(quat)
21
+ return r.as_euler('xyz', degrees=not use_radian)
22
+
23
+ def rot2quaterion(rot):
24
+ r = R.from_matrix(rot)
25
+ return r.as_quat()
26
+
27
+ def quaterion2rot(quat):
28
+ r = R.from_quat(quat)
29
+ return r.as_matrix()
30
+
31
+ if __name__ == '__main__':
32
+ euler = np.array([89.999,89.999,89.999] * 100).reshape([100,3])
33
+ q = euler2quaterion(euler, use_radian=False)
34
+ e = quaterion2euler(q, use_radian=False)
35
+ print(" ")
data_gen/utils/process_video/extract_blink.py ADDED
@@ -0,0 +1,50 @@
1
+ import numpy as np
2
+ from data_util.face3d_helper import Face3DHelper
3
+ from utils.commons.tensor_utils import convert_to_tensor
4
+
5
+ def polygon_area(x, y):
6
+ """
7
+ x: [T, K=6]
8
+ y: [T, K=6]
9
+ return: [T,]
10
+ """
11
+ x_ = x - x.mean(axis=-1, keepdims=True)
12
+ y_ = y - y.mean(axis=-1, keepdims=True)
13
+ correction = x_[:,-1] * y_[:,0] - y_[:,-1]* x_[:,0]
14
+ main_area = (x_[:,:-1] * y_[:,1:]).sum(axis=-1) - (y_[:,:-1] * x_[:,1:]).sum(axis=-1)
15
+ return 0.5 * np.abs(main_area + correction)
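+ # (added note) this is the shoelace formula; e.g. a unit square with corners (0,0),(1,0),(1,1),(0,1)
+ # gives area 1.0.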
16
+
17
+ def get_eye_area_percent(id, exp, face3d_helper):
18
+ id = convert_to_tensor(id)
19
+ exp = convert_to_tensor(exp)
20
+ cano_lm3d = face3d_helper.reconstruct_cano_lm3d(id, exp)
21
+ cano_lm2d = (cano_lm3d[..., :2] + 1) / 2
22
+ lms = cano_lm2d.cpu().numpy()
23
+ eyes_left = slice(36, 42)
24
+ eyes_right = slice(42, 48)
25
+ area_left = polygon_area(lms[:, eyes_left, 0], lms[:, eyes_left, 1])
26
+ area_right = polygon_area(lms[:, eyes_right, 0], lms[:, eyes_right, 1])
27
+ # area percentage of two eyes of the whole image...
28
+ area_percent = (area_left + area_right) / 1 * 100 # recommend threshold is 0.25%
29
+ return area_percent # [T,]
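+ # (added usage sketch) frames where area_percent falls below the 0.25 threshold mentioned above can be
+ # treated as closed-eye / blink frames.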
30
+
31
+
32
+ if __name__ == '__main__':
33
+ import numpy as np
34
+ import imageio
35
+ import cv2
36
+ import torch
37
+ from data_gen.utils.process_video.extract_lm2d import extract_lms_mediapipe_job, read_video_to_frames, index_lm68_from_lm468
38
+ from data_gen.utils.process_video.fit_3dmm_landmark import fit_3dmm_for_a_video
39
+ from data_util.face3d_helper import Face3DHelper
40
+
41
+ face3d_helper = Face3DHelper()
42
+ video_name = 'data/raw/videos/May_10s.mp4'
43
+ frames = read_video_to_frames(video_name)
44
+ coeff = fit_3dmm_for_a_video(video_name, save=False)
45
+ area_percent = get_eye_area_percent(torch.tensor(coeff['id']), torch.tensor(coeff['exp']), face3d_helper)
46
+ writer = imageio.get_writer("1.mp4", fps=25)
47
+ for idx, frame in enumerate(frames):
48
+ frame = cv2.putText(frame, f"{area_percent[idx]:.2f}", org=(128,128), fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=1, color=(255,0,0), thickness=1)
49
+ writer.append_data(frame)
50
+ writer.close()
data_gen/utils/process_video/extract_lm2d.py ADDED
@@ -0,0 +1,164 @@
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+ import sys
4
+ import glob
5
+ import cv2
6
+ import pickle
7
+ import tqdm
8
+ import numpy as np
9
+ import mediapipe as mp
10
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
11
+ from utils.commons.os_utils import multiprocess_glob
12
+ from data_gen.utils.mp_feature_extractors.face_landmarker import MediapipeLandmarker
13
+ import warnings
14
+ import traceback
15
+
16
+ warnings.filterwarnings('ignore')
17
+
18
+ """
19
+ The Face_alignment-based lm68 has been deprecated because:
20
+ 1. its prediction accuracy around the eye region is very low;
21
+ 2. it cannot accurately predict the occluded jawline at large head rotations, so the 3DMM GT labels are wrong at large angles, which hurts performance.
22
+ We now use the mediapipe-based lm68 instead.
23
+ """
24
+ # def extract_landmarks(ori_imgs_dir):
25
+
26
+ # print(f'[INFO] ===== extract face landmarks from {ori_imgs_dir} =====')
27
+
28
+ # fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, flip_input=False)
29
+ # image_paths = glob.glob(os.path.join(ori_imgs_dir, '*.png'))
30
+ # for image_path in tqdm.tqdm(image_paths):
31
+ # out_name = image_path.replace("/images_512/", "/lms_2d/").replace(".png",".lms")
32
+ # if os.path.exists(out_name):
33
+ # continue
34
+ # input = cv2.imread(image_path, cv2.IMREAD_UNCHANGED) # [H, W, 3]
35
+ # input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
36
+ # preds = fa.get_landmarks(input)
37
+ # if preds is None:
38
+ # print(f"Skip {image_path} for no face detected")
39
+ # continue
40
+ # if len(preds) > 0:
41
+ # lands = preds[0].reshape(-1, 2)[:,:2]
42
+ # os.makedirs(os.path.dirname(out_name), exist_ok=True)
43
+ # np.savetxt(out_name, lands, '%f')
44
+ # del fa
45
+ # print(f'[INFO] ===== extracted face landmarks =====')
46
+
47
+ def save_file(name, content):
48
+ with open(name, "wb") as f:
49
+ pickle.dump(content, f)
50
+
51
+ def load_file(name):
52
+ with open(name, "rb") as f:
53
+ content = pickle.load(f)
54
+ return content
55
+
56
+
57
+ face_landmarker = None
58
+
59
+ def extract_landmark_job(video_name, nerf=False):
60
+ try:
61
+ if nerf:
62
+ out_name = video_name.replace("/raw/", "/processed/").replace(".mp4","/lms_2d.npy")
63
+ else:
64
+ out_name = video_name.replace("/video/", "/lms_2d/").replace(".mp4","_lms.npy")
65
+ if os.path.exists(out_name):
66
+ # print("out exists, skip...")
67
+ return
68
+ try:
69
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
70
+ except:
71
+ pass
72
+ global face_landmarker
73
+ if face_landmarker is None:
74
+ face_landmarker = MediapipeLandmarker()
75
+ img_lm478, vid_lm478 = face_landmarker.extract_lm478_from_video_name(video_name)
76
+ lm478 = face_landmarker.combine_vid_img_lm478_to_lm478(img_lm478, vid_lm478)
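+ # img_lm478 / vid_lm478 come from mediapipe's image-mode and video-mode runs;
+ # they are merged into one per-frame lm478 array here before saving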
77
+ np.save(out_name, lm478)
78
+ return True
79
+ # print("Hahaha, solve one item!!!")
80
+ except Exception as e:
81
+ traceback.print_exc()
82
+ return False
83
+
84
+ def out_exist_job(vid_name):
85
+ out_name = vid_name.replace("/video/", "/lms_2d/").replace(".mp4","_lms.npy")
86
+ if os.path.exists(out_name):
87
+ return None
88
+ else:
89
+ return vid_name
90
+
91
+ def get_todo_vid_names(vid_names):
92
+ if len(vid_names) == 1: # nerf
93
+ return vid_names
94
+ todo_vid_names = []
95
+ for i, res in multiprocess_run_tqdm(out_exist_job, vid_names, num_workers=128):
96
+ if res is not None:
97
+ todo_vid_names.append(res)
98
+ return todo_vid_names
99
+
100
+ if __name__ == '__main__':
101
+ import argparse, glob, tqdm, random
102
+ parser = argparse.ArgumentParser()
103
+ parser.add_argument("--vid_dir", default='data/raw/videos/May.mp4')
104
+ parser.add_argument("--ds_name", default='nerf')
105
+ parser.add_argument("--num_workers", default=2, type=int)
106
+ parser.add_argument("--process_id", default=0, type=int)
107
+ parser.add_argument("--total_process", default=1, type=int)
108
+ parser.add_argument("--reset", action="store_true")
109
+ parser.add_argument("--load_names", action="store_true")
110
+
111
+ args = parser.parse_args()
112
+ vid_dir = args.vid_dir
113
+ ds_name = args.ds_name
114
+ load_names = args.load_names
115
+
116
+ if ds_name.lower() == 'nerf': # process a single video
117
+ vid_names = [vid_dir]
118
+ out_names = [video_name.replace("/raw/", "/processed/").replace(".mp4","/lms_2d.npy") for video_name in vid_names]
119
+ else: # process a whole dataset
120
+ if ds_name in ['lrs3_trainval']:
121
+ vid_name_pattern = os.path.join(vid_dir, "*/*.mp4")
122
+ elif ds_name in ['TH1KH_512', 'CelebV-HQ']:
123
+ vid_name_pattern = os.path.join(vid_dir, "*.mp4")
124
+ elif ds_name in ['lrs2', 'lrs3', 'voxceleb2', 'CMLR']:
125
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
126
+ elif ds_name in ["RAVDESS", 'VFHQ']:
127
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4")
128
+ else:
129
+ raise NotImplementedError()
130
+
131
+ vid_names_path = os.path.join(vid_dir, "vid_names.pkl")
132
+ if os.path.exists(vid_names_path) and load_names:
133
+ print(f"loading vid names from {vid_names_path}")
134
+ vid_names = load_file(vid_names_path)
135
+ else:
136
+ vid_names = multiprocess_glob(vid_name_pattern)
137
+ vid_names = sorted(vid_names)
138
+ if not load_names:
139
+ print(f"saving vid names to {vid_names_path}")
140
+ save_file(vid_names_path, vid_names)
141
+ out_names = [video_name.replace("/video/", "/lms_2d/").replace(".mp4","_lms.npy") for video_name in vid_names]
142
+
143
+ process_id = args.process_id
144
+ total_process = args.total_process
145
+ if total_process > 1:
146
+ assert process_id <= total_process -1
147
+ num_samples_per_process = len(vid_names) // total_process
148
+ if process_id == total_process - 1: # the last process takes the remainder
149
+ vid_names = vid_names[process_id * num_samples_per_process : ]
150
+ else:
151
+ vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
152
+
153
+ if not args.reset:
154
+ vid_names = get_todo_vid_names(vid_names)
155
+ print(f"todo videos number: {len(vid_names)}")
156
+
157
+ fail_cnt = 0
158
+ job_args = [(vid_name, ds_name=='nerf') for vid_name in vid_names]
159
+ for (i, res) in multiprocess_run_tqdm(extract_landmark_job, job_args, num_workers=args.num_workers, desc=f"Root {args.process_id}: extracting MP-based landmark2d"):
160
+ if res is False:
161
+ fail_cnt += 1
162
+ print(f"finished {i + 1} / {len(vid_names)} = {(i + 1) / len(vid_names):.4f}, failed {fail_cnt} / {i + 1} = {fail_cnt / (i + 1):.4f}")
163
+ sys.stdout.flush()
164
+ pass
data_gen/utils/process_video/extract_segment_imgs.py ADDED
@@ -0,0 +1,500 @@
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+ import random
4
+ import glob
5
+ import cv2
6
+ import tqdm
7
+ import numpy as np
8
+ import PIL
9
+ from utils.commons.tensor_utils import convert_to_np
10
+ from utils.commons.os_utils import multiprocess_glob
11
+ import pickle
12
+ import torch
13
+ import mediapipe as mp
14
+ import traceback
15
+ import multiprocessing
16
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
17
+ from scipy.ndimage import binary_erosion, binary_dilation
18
+ from sklearn.neighbors import NearestNeighbors
19
+ from mediapipe.tasks.python import vision
20
+ from data_gen.utils.mp_feature_extractors.mp_segmenter import MediapipeSegmenter, encode_segmap_mask_to_image, decode_segmap_mask_from_image
21
+
22
+ seg_model = None
23
+ segmenter = None
24
+ mat_model = None
25
+ lama_model = None
26
+ lama_config = None
27
+
28
+ from data_gen.utils.process_video.split_video_to_imgs import extract_img_job
29
+
30
+ BG_NAME_MAP = {
31
+ "knn": "",
32
+ "mat": "_mat",
33
+ "ddnm": "_ddnm",
34
+ "lama": "_lama",
35
+ }
36
+ FRAME_SELECT_INTERVAL = 5
37
+ SIM_METHOD = "mse"
38
+ SIM_THRESHOLD = 3
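+ # FRAME_SELECT_INTERVAL: only every 5th frame is fed to the KNN background
+ # estimator below (see extract_background), which keeps it tractable on long videos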
39
+
40
+ def save_file(name, content):
41
+ with open(name, "wb") as f:
42
+ pickle.dump(content, f)
43
+
44
+ def load_file(name):
45
+ with open(name, "rb") as f:
46
+ content = pickle.load(f)
47
+ return content
48
+
49
+ def save_rgb_alpha_image_to_path(img, alpha, img_path):
50
+ try: os.makedirs(os.path.dirname(img_path), exist_ok=True)
51
+ except: pass
52
+ cv2.imwrite(img_path, np.concatenate([cv2.cvtColor(img, cv2.COLOR_RGB2BGR), alpha], axis=-1))
53
+
54
+ def save_rgb_image_to_path(img, img_path):
55
+ try: os.makedirs(os.path.dirname(img_path), exist_ok=True)
56
+ except: pass
57
+ cv2.imwrite(img_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
58
+
59
+ def load_rgb_image_to_path(img_path):
60
+ return cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
61
+
62
+ def image_similarity(x: np.ndarray, y: np.ndarray, method="mse"):
63
+ if method == "mse":
64
+ return np.mean((x - y) ** 2)
65
+ else:
66
+ raise NotImplementedError
67
+
68
+ def extract_background(img_lst, segmap_mask_lst=None, method="knn", device='cpu', mix_bg=True):
69
+ """
70
+ img_lst: list of rgb ndarray
71
+ method: "knn" (the "mat" / "ddnm" / "lama" branches are not implemented in this function)
72
+ """
73
+ # only use 1 out of every FRAME_SELECT_INTERVAL frames for background estimation
74
+ global segmenter
75
+ global seg_model
76
+ global mat_model
77
+ global lama_model
78
+ global lama_config
79
+
80
+ assert len(img_lst) > 0
81
+ if segmap_mask_lst is not None:
82
+ assert len(segmap_mask_lst) == len(img_lst)
83
+ else:
84
+ del segmenter
85
+ del seg_model
86
+ seg_model = MediapipeSegmenter()
87
+ segmenter = vision.ImageSegmenter.create_from_options(seg_model.video_options)
88
+
89
+ def get_segmap_mask(img_lst, segmap_mask_lst, index):
90
+ if segmap_mask_lst is not None:
91
+ segmap = segmap_mask_lst[index]
92
+ else:
93
+ segmap = seg_model._cal_seg_map(img_lst[index], segmenter=segmenter)
94
+ return segmap
95
+
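+ # Sketch of the KNN background extraction implemented below:
+ # 1) for each sampled frame, compute every pixel's distance to its nearest non-background pixel;
+ # 2) a pixel is treated as reliable background if that distance exceeds 10 px in some frame,
+ #    and its color is copied from the frame where that distance is largest;
+ # 3) the remaining pixels are filled with the color of their nearest reliable-background pixel.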
96
+ if method == "knn":
97
+ num_frames = len(img_lst)
98
+ img_lst = img_lst[::FRAME_SELECT_INTERVAL] if num_frames > FRAME_SELECT_INTERVAL else img_lst[0:1]
99
+
100
+ if segmap_mask_lst is not None:
101
+ segmap_mask_lst = segmap_mask_lst[::FRAME_SELECT_INTERVAL] if num_frames > FRAME_SELECT_INTERVAL else segmap_mask_lst[0:1]
102
+ assert len(img_lst) == len(segmap_mask_lst)
103
+ # get H/W
104
+ h, w = img_lst[0].shape[:2]
105
+
106
+ # nearest neighbors
107
+ all_xys = np.mgrid[0:h, 0:w].reshape(2, -1).transpose() # [512*512, 2] coordinate grid
108
+ distss = []
109
+ for idx, img in enumerate(img_lst):
110
+ segmap = get_segmap_mask(img_lst=img_lst, segmap_mask_lst=segmap_mask_lst, index=idx)
111
+ bg = (segmap[0]).astype(bool) # [h,w] bool mask
112
+ fg_xys = np.stack(np.nonzero(~bg)).transpose(1, 0) # [N_nonbg,2] coordinate of non-bg pixels
113
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(fg_xys)
114
+ dists, _ = nbrs.kneighbors(all_xys) # [512*512, 1] distance to nearest non-bg pixel
115
+ distss.append(dists)
116
+
117
+ distss = np.stack(distss) # [B, 512*512, 1]
118
+ max_dist = np.max(distss, 0) # [512*512, 1]
119
+ max_id = np.argmax(distss, 0) # id of frame
120
+
121
+ bc_pixs = max_dist > 10 # pixels that are background in at least one frame; a pixel counts as bg when its distance to the nearest non-bg pixel exceeds 10
122
+ bc_pixs_id = np.nonzero(bc_pixs)
123
+ bc_ids = max_id[bc_pixs]
124
+
125
+ num_pixs = distss.shape[1]
126
+ imgs = np.stack(img_lst).reshape(-1, num_pixs, 3)
127
+
128
+ bg_img = np.zeros((h*w, 3), dtype=np.uint8)
129
+ bg_img[bc_pixs_id, :] = imgs[bc_ids, bc_pixs_id, :] # for definitely-background pixels, sample the color directly from the corresponding frame
130
+ bg_img = bg_img.reshape(h, w, 3)
131
+
132
+ max_dist = max_dist.reshape(h, w)
133
+ bc_pixs = max_dist > 10 # 5
134
+ bg_xys = np.stack(np.nonzero(~bc_pixs)).transpose()
135
+ fg_xys = np.stack(np.nonzero(bc_pixs)).transpose()
136
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(fg_xys)
137
+ distances, indices = nbrs.kneighbors(bg_xys) # for the remaining non-bg pixels, use KNN to find the nearest bg pixel
138
+ bg_fg_xys = fg_xys[indices[:, 0]]
139
+ bg_img[bg_xys[:, 0], bg_xys[:, 1], :] = bg_img[bg_fg_xys[:, 0], bg_fg_xys[:, 1], :]
140
+ else:
141
+ raise NotImplementedError # deprecated
142
+
143
+ return bg_img
144
+
145
+ def inpaint_torso_job(gt_img, segmap):
146
+ bg_part = (segmap[0]).astype(bool)
147
+ head_part = (segmap[1] + segmap[3] + segmap[5]).astype(bool)
148
+ neck_part = (segmap[2]).astype(bool)
149
+ torso_part = (segmap[4]).astype(bool)
150
+ img = gt_img.copy()
151
+ img[head_part] = 0
152
+
153
+ # torso part "vertical" in-painting...
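+ # idea: for each torso column whose topmost pixel touches the head region, copy that
+ # top color upward for L rows (slightly darkened) to fill the area occluded by the head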
154
+ L = 8 + 1
155
+ torso_coords = np.stack(np.nonzero(torso_part), axis=-1) # [M, 2]
156
+ # lexsort: sort 2D coords first by y then by x,
157
+ # ref: https://stackoverflow.com/questions/2706605/sorting-a-2d-numpy-array-by-multiple-axes
158
+ inds = np.lexsort((torso_coords[:, 0], torso_coords[:, 1]))
159
+ torso_coords = torso_coords[inds]
160
+ # choose the top pixel for each column
161
+ u, uid, ucnt = np.unique(torso_coords[:, 1], return_index=True, return_counts=True)
162
+ top_torso_coords = torso_coords[uid] # [m, 2]
163
+ # only keep top-is-head pixels
164
+ top_torso_coords_up = top_torso_coords.copy() - np.array([1, 0]) # [N, 2]
165
+ mask = head_part[tuple(top_torso_coords_up.T)]
166
+ if mask.any():
167
+ top_torso_coords = top_torso_coords[mask]
168
+ # get the color
169
+ top_torso_colors = gt_img[tuple(top_torso_coords.T)] # [m, 3]
170
+ # construct inpaint coords (vertically up, or minus in x)
171
+ inpaint_torso_coords = top_torso_coords[None].repeat(L, 0) # [L, m, 2]
172
+ inpaint_offsets = np.stack([-np.arange(L), np.zeros(L, dtype=np.int32)], axis=-1)[:, None] # [L, 1, 2]
173
+ inpaint_torso_coords += inpaint_offsets
174
+ inpaint_torso_coords = inpaint_torso_coords.reshape(-1, 2) # [Lm, 2]
175
+ inpaint_torso_colors = top_torso_colors[None].repeat(L, 0) # [L, m, 3]
176
+ darken_scaler = 0.98 ** np.arange(L).reshape(L, 1, 1) # [L, 1, 1]
177
+ inpaint_torso_colors = (inpaint_torso_colors * darken_scaler).reshape(-1, 3) # [Lm, 3]
178
+ # set color
179
+ img[tuple(inpaint_torso_coords.T)] = inpaint_torso_colors
180
+ inpaint_torso_mask = np.zeros_like(img[..., 0]).astype(bool)
181
+ inpaint_torso_mask[tuple(inpaint_torso_coords.T)] = True
182
+ else:
183
+ inpaint_torso_mask = None
184
+
185
+ # neck part "vertical" in-painting...
186
+ push_down = 4
187
+ L = 48 + push_down + 1
188
+ neck_part = binary_dilation(neck_part, structure=np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=bool), iterations=3)
189
+ neck_coords = np.stack(np.nonzero(neck_part), axis=-1) # [M, 2]
190
+ # lexsort: sort 2D coords first by y then by x,
191
+ # ref: https://stackoverflow.com/questions/2706605/sorting-a-2d-numpy-array-by-multiple-axes
192
+ inds = np.lexsort((neck_coords[:, 0], neck_coords[:, 1]))
193
+ neck_coords = neck_coords[inds]
194
+ # choose the top pixel for each column
195
+ u, uid, ucnt = np.unique(neck_coords[:, 1], return_index=True, return_counts=True)
196
+ top_neck_coords = neck_coords[uid] # [m, 2]
197
+ # only keep top-is-head pixels
198
+ top_neck_coords_up = top_neck_coords.copy() - np.array([1, 0])
199
+ mask = head_part[tuple(top_neck_coords_up.T)]
200
+ top_neck_coords = top_neck_coords[mask]
201
+ # push these top down for 4 pixels to make the neck inpainting more natural...
202
+ offset_down = np.minimum(ucnt[mask] - 1, push_down)
203
+ top_neck_coords += np.stack([offset_down, np.zeros_like(offset_down)], axis=-1)
204
+ # get the color
205
+ top_neck_colors = gt_img[tuple(top_neck_coords.T)] # [m, 3]
206
+ # construct inpaint coords (vertically up, or minus in x)
207
+ inpaint_neck_coords = top_neck_coords[None].repeat(L, 0) # [L, m, 2]
208
+ inpaint_offsets = np.stack([-np.arange(L), np.zeros(L, dtype=np.int32)], axis=-1)[:, None] # [L, 1, 2]
209
+ inpaint_neck_coords += inpaint_offsets
210
+ inpaint_neck_coords = inpaint_neck_coords.reshape(-1, 2) # [Lm, 2]
211
+ inpaint_neck_colors = top_neck_colors[None].repeat(L, 0) # [L, m, 3]
212
+ darken_scaler = 0.98 ** np.arange(L).reshape(L, 1, 1) # [L, 1, 1]
213
+ inpaint_neck_colors = (inpaint_neck_colors * darken_scaler).reshape(-1, 3) # [Lm, 3]
214
+ # set color
215
+ img[tuple(inpaint_neck_coords.T)] = inpaint_neck_colors
216
+ # apply blurring to the inpainted area to avoid vertical-line artifacts...
217
+ inpaint_mask = np.zeros_like(img[..., 0]).astype(bool)
218
+ inpaint_mask[tuple(inpaint_neck_coords.T)] = True
219
+
220
+ blur_img = img.copy()
221
+ blur_img = cv2.GaussianBlur(blur_img, (5, 5), cv2.BORDER_DEFAULT)
222
+ img[inpaint_mask] = blur_img[inpaint_mask]
223
+
224
+ # set mask
225
+ torso_img_mask = (neck_part | torso_part | inpaint_mask)
226
+ torso_with_bg_img_mask = (bg_part | neck_part | torso_part | inpaint_mask)
227
+ if inpaint_torso_mask is not None:
228
+ torso_img_mask = torso_img_mask | inpaint_torso_mask
229
+ torso_with_bg_img_mask = torso_with_bg_img_mask | inpaint_torso_mask
230
+
231
+ torso_img = img.copy()
232
+ torso_img[~torso_img_mask] = 0
233
+ torso_with_bg_img = img.copy()
234
+ torso_with_bg_img[~torso_with_bg_img_mask] = 0
235
+
236
+ return torso_img, torso_img_mask, torso_with_bg_img, torso_with_bg_img_mask
237
+
238
+
239
+ def extract_segment_job(video_name, nerf=False, idx=None, total=None, background_method='knn', device="cpu", total_gpus=0, mix_bg=True):
240
+ global segmenter
241
+ global seg_model
242
+ del segmenter
243
+ del seg_model
244
+ seg_model = MediapipeSegmenter()
245
+ segmenter = vision.ImageSegmenter.create_from_options(seg_model.video_options)
246
+ try:
247
+ if "cuda" in device:
248
+ # determine which cuda index from subprocess id
249
+ pname = multiprocessing.current_process().name
250
+ pid = int(pname.rsplit("-", 1)[-1]) - 1
251
+ cuda_id = pid % total_gpus
252
+ device = f"cuda:{cuda_id}"
253
+
254
+ if nerf: # single video
255
+ raw_img_dir = video_name.replace(".mp4", "/gt_imgs/").replace("/raw/","/processed/")
256
+ else: # whole dataset
257
+ raw_img_dir = video_name.replace(".mp4", "").replace("/video/", "/gt_imgs/")
258
+ if not os.path.exists(raw_img_dir):
259
+ extract_img_job(video_name, raw_img_dir) # use ffmpeg to split video into imgs
260
+
261
+ img_names = glob.glob(os.path.join(raw_img_dir, "*.jpg"))
262
+
263
+ img_lst = []
264
+
265
+ for img_name in img_names:
266
+ img = cv2.imread(img_name)
267
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
268
+ img_lst.append(img)
269
+
270
+ segmap_mask_lst, segmap_image_lst = seg_model._cal_seg_map_for_video(img_lst, segmenter=segmenter, return_onehot_mask=True, return_segmap_image=True)
271
+ del segmap_image_lst
272
+ # for i in range(len(img_lst)):
273
+ for i in tqdm.trange(len(img_lst), desc='generating segment images using segmaps...'):
274
+ img_name = img_names[i]
275
+ segmap = segmap_mask_lst[i]
276
+ img = img_lst[i]
277
+ out_img_name = img_name.replace("/gt_imgs/", "/segmaps/").replace(".jpg", ".png") # save as png: jpg compression would corrupt the encoded pixel values
278
+ try: os.makedirs(os.path.dirname(out_img_name), exist_ok=True)
279
+ except: pass
280
+ encoded_segmap = encode_segmap_mask_to_image(segmap)
281
+ save_rgb_image_to_path(encoded_segmap, out_img_name)
282
+
283
+ for mode in ['head', 'torso', 'person', 'bg']:
284
+ out_img, mask = seg_model._seg_out_img_with_segmap(img, segmap, mode=mode)
285
+ img_alpha = 255 * np.ones((img.shape[0], img.shape[1], 1), dtype=np.uint8) # alpha
286
+ mask = mask[0][..., None]
287
+ img_alpha[~mask] = 0
288
+ out_img_name = img_name.replace("/gt_imgs/", f"/{mode}_imgs/").replace(".jpg", ".png")
289
+ save_rgb_alpha_image_to_path(out_img, img_alpha, out_img_name)
290
+
291
+ inpaint_torso_img, inpaint_torso_img_mask, inpaint_torso_with_bg_img, inpaint_torso_with_bg_img_mask = inpaint_torso_job(img, segmap)
292
+ img_alpha = 255 * np.ones((img.shape[0], img.shape[1], 1), dtype=np.uint8) # alpha
293
+ img_alpha[~inpaint_torso_img_mask[..., None]] = 0
294
+ out_img_name = img_name.replace("/gt_imgs/", f"/inpaint_torso_imgs/").replace(".jpg", ".png")
295
+ save_rgb_alpha_image_to_path(inpaint_torso_img, img_alpha, out_img_name)
296
+
297
+ bg_prefix_name = f"bg{BG_NAME_MAP[background_method]}"
298
+ bg_img = extract_background(img_lst, segmap_mask_lst, method=background_method, device=device, mix_bg=mix_bg)
299
+ if nerf:
300
+ out_img_name = video_name.replace("/raw/", "/processed/").replace(".mp4", f"/{bg_prefix_name}.jpg")
301
+ else:
302
+ out_img_name = video_name.replace("/video/", f"/{bg_prefix_name}_img/").replace(".mp4", ".jpg")
303
+ save_rgb_image_to_path(bg_img, out_img_name)
304
+
305
+ com_prefix_name = f"com{BG_NAME_MAP[background_method]}"
306
+ for i, img_name in enumerate(img_names):
307
+ com_img = img_lst[i].copy()
308
+ segmap = segmap_mask_lst[i]
309
+ bg_part = segmap[0].astype(bool)[..., None].repeat(3,axis=-1)
310
+ com_img[bg_part] = bg_img[bg_part]
311
+ out_img_name = img_name.replace("/gt_imgs/", f"/{com_prefix_name}_imgs/")
312
+ save_rgb_image_to_path(com_img, out_img_name)
313
+ return 0
314
+ except Exception as e:
315
+ print(str(type(e)), e)
316
+ traceback.print_exc()
317
+ return 1
318
+
319
+ # def check_bg_img_job_finished(raw_img_dir, bg_name, com_dir):
320
+ # img_names = glob.glob(os.path.join(raw_img_dir, "*.jpg"))
321
+ # com_names = glob.glob(os.path.join(com_dir, "*.jpg"))
322
+ # return len(img_names) == len(com_names) and os.path.exists(bg_name)
323
+
324
+ # extract background and combined image
325
+ # need pre-processed "gt_imgs" and "segmaps"
326
+ def extract_bg_img_job(video_name, nerf=False, idx=None, total=None, background_method='knn', device="cpu", total_gpus=0, mix_bg=True):
327
+ try:
328
+ bg_prefix_name = f"bg{BG_NAME_MAP[background_method]}"
329
+ com_prefix_name = f"com{BG_NAME_MAP[background_method]}"
330
+
331
+ if "cuda" in device:
332
+ # determine which cuda index from subprocess id
333
+ pname = multiprocessing.current_process().name
334
+ pid = int(pname.rsplit("-", 1)[-1]) - 1
335
+ cuda_id = pid % total_gpus
336
+ device = f"cuda:{cuda_id}"
337
+
338
+ if nerf: # single video
339
+ raw_img_dir = video_name.replace(".mp4", "/gt_imgs/").replace("/raw/","/processed/")
340
+ else: # whole dataset
341
+ raw_img_dir = video_name.replace(".mp4", "").replace("/video/", "/gt_imgs/")
342
+ if nerf:
343
+ bg_name = video_name.replace("/raw/", "/processed/").replace(".mp4", f"/{bg_prefix_name}.jpg")
344
+ else:
345
+ bg_name = video_name.replace("/video/", f"/{bg_prefix_name}_img/").replace(".mp4", ".jpg")
346
+ # com_dir = raw_img_dir.replace("/gt_imgs/", f"/{com_prefix_name}_imgs/")
347
+ # if check_bg_img_job_finished(raw_img_dir=raw_img_dir, bg_name=bg_name, com_dir=com_dir):
348
+ # print(f"Already finished, skip {raw_img_dir} ")
349
+ # return 0
350
+
351
+ img_names = glob.glob(os.path.join(raw_img_dir, "*.jpg"))
352
+ img_lst = []
353
+ for img_name in img_names:
354
+ img = cv2.imread(img_name)
355
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
356
+ img_lst.append(img)
357
+
358
+ segmap_mask_lst = []
359
+ for img_name in img_names:
360
+ segmap_img_name = img_name.replace("/gt_imgs/", "/segmaps/").replace(".jpg", ".png")
361
+ segmap_img = load_rgb_image_to_path(segmap_img_name)
362
+
363
+ segmap_mask = decode_segmap_mask_from_image(segmap_img)
364
+ segmap_mask_lst.append(segmap_mask)
365
+
366
+ bg_img = extract_background(img_lst, segmap_mask_lst, method=background_method, device=device, mix_bg=mix_bg)
367
+ save_rgb_image_to_path(bg_img, bg_name)
368
+
369
+ for i, img_name in enumerate(img_names):
370
+ com_img = img_lst[i].copy()
371
+ segmap = segmap_mask_lst[i]
372
+ bg_part = segmap[0].astype(bool)[..., None].repeat(3, axis=-1)
373
+ com_img[bg_part] = bg_img[bg_part]
374
+ com_name = img_name.replace("/gt_imgs/", f"/{com_prefix_name}_imgs/")
375
+ save_rgb_image_to_path(com_img, com_name)
376
+ return 0
377
+
378
+ except Exception as e:
379
+ print(str(type(e)), e)
380
+ traceback.print_exc()
381
+ return 1
382
+
383
+ def out_exist_job(vid_name, background_method='knn', only_bg_img=False):
384
+ com_prefix_name = f"com{BG_NAME_MAP[background_method]}"
385
+ img_dir = vid_name.replace("/video/", "/gt_imgs/").replace(".mp4", "")
386
+ out_dir1 = img_dir.replace("/gt_imgs/", "/head_imgs/")
387
+ out_dir2 = img_dir.replace("/gt_imgs/", f"/{com_prefix_name}_imgs/")
388
+
389
+ if not only_bg_img:
390
+ if os.path.exists(img_dir) and os.path.exists(out_dir1) and os.path.exists(out_dir2):
391
+ num_frames = len(os.listdir(img_dir))
392
+ if len(os.listdir(out_dir1)) == num_frames and len(os.listdir(out_dir2)) == num_frames:
393
+ return None
394
+ else:
395
+ return vid_name
396
+ else:
397
+ return vid_name
398
+ else:
399
+ if os.path.exists(img_dir) and os.path.exists(out_dir2):
400
+ num_frames = len(os.listdir(img_dir))
401
+ if len(os.listdir(out_dir2)) == num_frames:
402
+ return None
403
+ else:
404
+ return vid_name
405
+ else:
406
+ return vid_name
407
+
408
+ def get_todo_vid_names(vid_names, background_method='knn', only_bg_img=False):
409
+ if len(vid_names) == 1: # nerf
410
+ return vid_names
411
+ todo_vid_names = []
412
+ fn_args = [(vid_name, background_method, only_bg_img) for vid_name in vid_names]
413
+ for i, res in multiprocess_run_tqdm(out_exist_job, fn_args, num_workers=16, desc="checking todo videos..."):
414
+ if res is not None:
415
+ todo_vid_names.append(res)
416
+ return todo_vid_names
417
+
418
+ if __name__ == '__main__':
419
+ import argparse, glob, tqdm, random
420
+ parser = argparse.ArgumentParser()
421
+ parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video')
422
+ parser.add_argument("--ds_name", default='CelebV-HQ')
423
+ parser.add_argument("--num_workers", default=48, type=int)
424
+ parser.add_argument("--seed", default=0, type=int)
425
+ parser.add_argument("--process_id", default=0, type=int)
426
+ parser.add_argument("--total_process", default=1, type=int)
427
+ parser.add_argument("--reset", action='store_true')
428
+ parser.add_argument("--load_names", action="store_true")
429
+ parser.add_argument("--background_method", choices=['knn', 'mat', 'ddnm', 'lama'], type=str, default='knn')
430
+ parser.add_argument("--total_gpus", default=0, type=int) # zero gpus means utilizing cpu
431
+ parser.add_argument("--only_bg_img", action="store_true")
432
+ parser.add_argument("--no_mix_bg", action="store_true")
433
+
434
+ args = parser.parse_args()
435
+ vid_dir = args.vid_dir
436
+ ds_name = args.ds_name
437
+ load_names = args.load_names
438
+ background_method = args.background_method
439
+ total_gpus = args.total_gpus
440
+ only_bg_img = args.only_bg_img
441
+ mix_bg = not args.no_mix_bg
442
+
443
+ devices = os.environ.get('CUDA_VISIBLE_DEVICES', '').split(",")
444
+ for d in devices[:total_gpus]:
445
+ os.system(f'pkill -f "voidgpu{d}"')
446
+
447
+ if ds_name.lower() == 'nerf': # process a single video
448
+ vid_names = [vid_dir]
449
+ out_names = [video_name.replace("/raw/", "/processed/").replace(".mp4","_lms.npy") for video_name in vid_names]
450
+ else: # process a whole dataset
451
+ if ds_name in ['lrs3_trainval']:
452
+ vid_name_pattern = os.path.join(vid_dir, "*/*.mp4")
453
+ elif ds_name in ['TH1KH_512', 'CelebV-HQ']:
454
+ vid_name_pattern = os.path.join(vid_dir, "*.mp4")
455
+ elif ds_name in ['lrs2', 'lrs3', 'voxceleb2']:
456
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
457
+ elif ds_name in ["RAVDESS", 'VFHQ']:
458
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4")
459
+ else:
460
+ raise NotImplementedError()
461
+
462
+ vid_names_path = os.path.join(vid_dir, "vid_names.pkl")
463
+ if os.path.exists(vid_names_path) and load_names:
464
+ print(f"loading vid names from {vid_names_path}")
465
+ vid_names = load_file(vid_names_path)
466
+ else:
467
+ vid_names = multiprocess_glob(vid_name_pattern)
468
+ vid_names = sorted(vid_names)
469
+ print(f"saving vid names to {vid_names_path}")
470
+ save_file(vid_names_path, vid_names)
471
+
472
+ vid_names = sorted(vid_names)
473
+ random.seed(args.seed)
474
+ random.shuffle(vid_names)
475
+
476
+ process_id = args.process_id
477
+ total_process = args.total_process
478
+ if total_process > 1:
479
+ assert process_id <= total_process -1
480
+ num_samples_per_process = len(vid_names) // total_process
481
+ if process_id == total_process - 1: # the last process takes the remainder
482
+ vid_names = vid_names[process_id * num_samples_per_process : ]
483
+ else:
484
+ vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
485
+
486
+ if not args.reset:
487
+ vid_names = get_todo_vid_names(vid_names, background_method, only_bg_img)
488
+ print(f"todo videos number: {len(vid_names)}")
489
+ # exit()
490
+
491
+ device = "cuda" if total_gpus > 0 else "cpu"
492
+ if only_bg_img:
493
+ extract_job = extract_bg_img_job
494
+ fn_args = [(vid_name,ds_name=='nerf',i,len(vid_names), background_method, device, total_gpus, mix_bg) for i, vid_name in enumerate(vid_names)]
495
+ else:
496
+ extract_job = extract_segment_job
497
+ fn_args = [(vid_name,ds_name=='nerf',i,len(vid_names), background_method, device, total_gpus, mix_bg) for i, vid_name in enumerate(vid_names)]
498
+
499
+ for vid_name in multiprocess_run_tqdm(extract_job, fn_args, desc=f"Root process {args.process_id}: segment images", num_workers=args.num_workers):
500
+ pass
data_gen/utils/process_video/fit_3dmm_landmark.py ADDED
@@ -0,0 +1,565 @@
1
+ # This is a script for efficient 3DMM coefficient extraction.
2
+ # It can reconstruct an accurate 3D face in real time.
3
+ # It is built upon the BFM 2009 model and the mediapipe landmark extractor.
4
+ # It is authored by ZhenhuiYe (zhenhuiye@zju.edu.cn); feel free to contact him with any suggestions for improvement!
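+ # Example invocation for a single video (illustrative path, matching the argparse defaults below):
+ #   python data_gen/utils/process_video/fit_3dmm_landmark.py --ds_name=nerf --vid_dir=data/raw/videos/May_10s.mp4 --debug
+ # In nerf mode with mediapipe keypoints, the fitted {id, exp, euler, trans} dict is saved as coeff_fit_mp.npy under data/processed/videos/May_10s/.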
5
+
6
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import copy
11
+ import numpy as np
12
+
13
+ import random
14
+ import pickle
15
+ import os
16
+ import sys
17
+ import cv2
18
+ import argparse
19
+ import tqdm
20
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
21
+ from data_gen.utils.mp_feature_extractors.face_landmarker import MediapipeLandmarker, read_video_to_frames
22
+ from deep_3drecon.deep_3drecon_models.bfm import ParametricFaceModel
23
+ from deep_3drecon.secc_renderer import SECC_Renderer
24
+ from utils.commons.os_utils import multiprocess_glob
25
+
26
+
27
+ face_model = ParametricFaceModel(bfm_folder='deep_3drecon/BFM',
28
+ camera_distance=10, focal=1015, keypoint_mode='mediapipe')
29
+ face_model.to(torch.device("cuda:0"))
30
+
31
+ dir_path = os.path.dirname(os.path.realpath(__file__))
32
+
33
+
34
+ def draw_axes(img, pitch, yaw, roll, tx, ty, size=50):
35
+ # yaw = -yaw
36
+ pitch = - pitch
37
+ roll = - roll
38
+ rotation_matrix = cv2.Rodrigues(np.array([pitch, yaw, roll]))[0].astype(np.float64)
39
+ axes_points = np.array([
40
+ [1, 0, 0, 0],
41
+ [0, 1, 0, 0],
42
+ [0, 0, 1, 0]
43
+ ], dtype=np.float64)
44
+ axes_points = rotation_matrix @ axes_points
45
+ axes_points = (axes_points[:2, :] * size).astype(int)
46
+ axes_points[0, :] = axes_points[0, :] + tx
47
+ axes_points[1, :] = axes_points[1, :] + ty
48
+
49
+ new_img = img.copy()
50
+ cv2.line(new_img, tuple(axes_points[:, 3].ravel()), tuple(axes_points[:, 0].ravel()), (255, 0, 0), 3)
51
+ cv2.line(new_img, tuple(axes_points[:, 3].ravel()), tuple(axes_points[:, 1].ravel()), (0, 255, 0), 3)
52
+ cv2.line(new_img, tuple(axes_points[:, 3].ravel()), tuple(axes_points[:, 2].ravel()), (0, 0, 255), 3)
53
+ return new_img
54
+
55
+ def save_file(name, content):
56
+ with open(name, "wb") as f:
57
+ pickle.dump(content, f)
58
+
59
+ def load_file(name):
60
+ with open(name, "rb") as f:
61
+ content = pickle.load(f)
62
+ return content
63
+
64
+ def cal_lap_loss(in_tensor):
65
+ # [T, 68, 2]
66
+ t = in_tensor.shape[0]
67
+ in_tensor = in_tensor.reshape([t, -1]).permute(1,0).unsqueeze(1) # [c, 1, t]
68
+ in_tensor = torch.cat([in_tensor[:, :, 0:1], in_tensor, in_tensor[:, :, -1:]], dim=-1)
69
+ lap_kernel = torch.Tensor((-0.5, 1.0, -0.5)).reshape([1,1,3]).float().to(in_tensor.device) # [1, 1, kw]
70
+ loss_lap = 0
71
+
72
+ out_tensor = F.conv1d(in_tensor, lap_kernel)
73
+ loss_lap += torch.mean(out_tensor**2)
74
+ return loss_lap
75
+
76
+ def cal_vel_loss(ldm):
77
+ # [B, 68, 2]
78
+ vel = ldm[1:] - ldm[:-1]
79
+ return torch.mean(torch.abs(vel))
80
+
81
+ def cal_lan_loss(proj_lan, gt_lan):
82
+ # [B, 68, 2]
83
+ loss = (proj_lan - gt_lan)** 2
84
+ # use the ldm weights from deep3drecon, see deep_3drecon/deep_3drecon_models/losses.py
85
+ weights = torch.ones_like(loss)
87
+ weights[:, 36:48, :] = 3 # eye 12 points
88
+ weights[:, -8:, :] = 3 # inner lip 8 points
89
+ weights[:, 28:31, :] = 3 # nose 3 points
90
+ loss = loss * weights
91
+ return torch.mean(loss)
92
+
93
+ def cal_lan_loss_mp(proj_lan, gt_lan, mean:bool=True):
94
+ # [B, 68, 2]
95
+ loss = (proj_lan - gt_lan).pow(2)
96
+ # loss = (proj_lan - gt_lan).abs()
97
+ unmatch_mask = [ 93, 127, 132, 234, 323, 356, 361, 454]
98
+ upper_eye = [161,160,159,158,157] + [388,387,386,385,384]
99
+ eye = [33,246,161,160,159,158,157,173,133,155,154,153,145,144,163,7] + [263,466,388,387,386,385,384,398,362,382,381,380,374,373,390,249]
100
+ inner_lip = [78,191,80,81,82,13,312,311,310,415,308,324,318,402,317,14,87,178,88,95]
101
+ outer_lip = [61,185,40,39,37,0,267,269,270,409,291,375,321,405,314,17,84,181,91,146]
102
+ weights = torch.ones_like(loss)
103
+ weights[:, eye] = 3
104
+ weights[:, upper_eye] = 20
105
+ weights[:, inner_lip] = 5
106
+ weights[:, outer_lip] = 5
107
+ weights[:, unmatch_mask] = 0
108
+ loss = loss * weights
109
+ if mean:
110
+ loss = torch.mean(loss)
111
+ return loss
112
+
113
+ def cal_acceleration_loss(trans):
114
+ vel = trans[1:] - trans[:-1]
115
+ acc = vel[1:] - vel[:-1]
116
+ return torch.mean(torch.abs(acc))
117
+
118
+ def cal_acceleration_ldm_loss(ldm):
119
+ # [B, 68, 2]
120
+ vel = ldm[1:] - ldm[:-1]
121
+ acc = vel[1:] - vel[:-1]
122
+ lip_weight = 0.25 # we dont want smooth the lip too much
123
+ acc[48:68] *= lip_weight
124
+ return torch.mean(torch.abs(acc))
125
+
126
+ def set_requires_grad(tensor_list):
127
+ for tensor in tensor_list:
128
+ tensor.requires_grad = True
129
+
130
+ @torch.enable_grad()
131
+ def fit_3dmm_for_a_video(
132
+ video_name,
133
+ nerf=False, # use the file name convention for GeneFace++
134
+ id_mode='global',
135
+ debug=False,
136
+ keypoint_mode='mediapipe',
137
+ large_yaw_threshold=9999999.9,
138
+ save=True
139
+ ) -> bool: # True: good, False: bad
140
+ assert video_name.endswith(".mp4"), "this function only support video as input"
141
+ if id_mode == 'global':
142
+ LAMBDA_REG_ID = 0.2
143
+ LAMBDA_REG_EXP = 0.6
144
+ LAMBDA_REG_LAP = 1.0
145
+ LAMBDA_REG_VEL_ID = 0.0 # laplacian is all you need for temporal consistency
146
+ LAMBDA_REG_VEL_EXP = 0.0 # laplacian is all you need for temporal consistency
147
+ else:
148
+ LAMBDA_REG_ID = 0.3
149
+ LAMBDA_REG_EXP = 0.05
150
+ LAMBDA_REG_LAP = 1.0
151
+ LAMBDA_REG_VEL_ID = 0.0 # laplacian is all you need for temporal consistency
152
+ LAMBDA_REG_VEL_EXP = 0.0 # laplacian is all you need for temporal consistency
153
+
154
+ frames = read_video_to_frames(video_name) # [T, H, W, 3]
155
+ img_h, img_w = frames.shape[1], frames.shape[2]
156
+ assert img_h == img_w
157
+ num_frames = len(frames)
158
+
159
+ if nerf: # single video
160
+ lm_name = video_name.replace("/raw/", "/processed/").replace(".mp4","/lms_2d.npy")
161
+ else:
162
+ lm_name = video_name.replace("/video/", "/lms_2d/").replace(".mp4", "_lms.npy")
163
+
164
+ if os.path.exists(lm_name):
165
+ lms = np.load(lm_name)
166
+ else:
167
+ print(f"lms_2d file not found, try to extract it from video... {lm_name}")
168
+ try:
169
+ landmarker = MediapipeLandmarker()
170
+ img_lm478, vid_lm478 = landmarker.extract_lm478_from_frames(frames, anti_smooth_factor=20)
171
+ lms = landmarker.combine_vid_img_lm478_to_lm478(img_lm478, vid_lm478)
172
+ except Exception as e:
173
+ print(e)
174
+ return False
175
+ if lms is None:
176
+ print(f"get None lms_2d, please check whether each frame has one head, exiting... {lm_name}")
177
+ return False
178
+ lms = lms[:, :468, :]
179
+ lms = torch.FloatTensor(lms).cuda()
180
+ lms[..., 1] = img_h - lms[..., 1] # flip the height axis
181
+
182
+ if keypoint_mode == 'mediapipe':
183
+ # default
184
+ cal_lan_loss_fn = cal_lan_loss_mp
185
+ if nerf: # single video
186
+ out_name = video_name.replace("/raw/", "/processed/").replace(".mp4", "/coeff_fit_mp.npy")
187
+ else:
188
+ out_name = video_name.replace("/video/", "/coeff_fit_mp/").replace(".mp4", "_coeff_fit_mp.npy")
189
+ else:
190
+ # lm68 is less accurate than mp
191
+ cal_lan_loss_fn = cal_lan_loss
192
+ if nerf: # single video
193
+ out_name = video_name.replace("/raw/", "/processed/").replace(".mp4", "_coeff_fit_lm68.npy")
194
+ else:
195
+ out_name = video_name.replace("/video/", "/coeff_fit_lm68/").replace(".mp4", "_coeff_fit_lm68.npy")
196
+ try:
197
+ os.makedirs(os.path.dirname(out_name), exist_ok=True)
198
+ except:
199
+ pass
200
+
201
+ id_dim, exp_dim = 80, 64
202
+ sel_ids = np.arange(0, num_frames, 40)
203
+
204
+ h = w = face_model.center * 2
205
+ img_scale_factor = img_h / h
206
+ lms /= img_scale_factor # rescale lms into [0,224]
207
+
208
+ if id_mode == 'global':
209
+ # default choice by GeneFace++ and later works
210
+ id_para = lms.new_zeros((1, id_dim), requires_grad=True)
211
+ elif id_mode == 'finegrained':
212
+ # legacy choice by GeneFace1 (ICLR 2023)
213
+ id_para = lms.new_zeros((num_frames, id_dim), requires_grad=True)
214
+ else: raise NotImplementedError(f"id mode {id_mode} not supported! we only support global or finegrained.")
215
+ exp_para = lms.new_zeros((num_frames, exp_dim), requires_grad=True)
216
+ euler_angle = lms.new_zeros((num_frames, 3), requires_grad=True)
217
+ trans = lms.new_zeros((num_frames, 3), requires_grad=True)
218
+
219
+ set_requires_grad([id_para, exp_para, euler_angle, trans])
220
+
221
+ optimizer_idexp = torch.optim.Adam([id_para, exp_para], lr=.1)
222
+ optimizer_frame = torch.optim.Adam([euler_angle, trans], lr=.1)
223
+
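+ # The fitting below runs in three stages:
+ #   1) optimize euler/trans only with a pure landmark loss (200 iters);
+ #   2) jointly coarse-fit id/exp/euler/trans with L2 and laplacian regularization (200 iters);
+ #   3) fine-fit in batches of 50 frames with a smaller learning rate (50 iters per batch).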
224
+ # stage 1: with the other parameters at their init values, fit the euler angles and translation first
225
+ for _ in range(200):
226
+ if id_mode == 'global':
227
+ proj_geo = face_model.compute_for_landmark_fit(
228
+ id_para.expand((num_frames, id_dim)), exp_para, euler_angle, trans)
229
+ else:
230
+ proj_geo = face_model.compute_for_landmark_fit(
231
+ id_para, exp_para, euler_angle, trans)
232
+ loss_lan = cal_lan_loss_fn(proj_geo[:, :, :2], lms.detach())
233
+ loss = loss_lan
234
+ optimizer_frame.zero_grad()
235
+ loss.backward()
236
+ optimizer_frame.step()
237
+
238
+ # print(f"loss_lan: {loss_lan.item():.2f}, euler_abs_mean: {euler_angle.abs().mean().item():.4f}, euler_std: {euler_angle.std().item():.4f}, euler_min: {euler_angle.min().item():.4f}, euler_max: {euler_angle.max().item():.4f}")
239
+ # print(f"trans_z_mean: {trans[...,2].mean().item():.4f}, trans_z_std: {trans[...,2].std().item():.4f}, trans_min: {trans[...,2].min().item():.4f}, trans_max: {trans[...,2].max().item():.4f}")
240
+
241
+ for param_group in optimizer_frame.param_groups:
242
+ param_group['lr'] = 0.1
243
+
244
+ # stage 2: jointly coarse-fit id, exp, euler and trans
245
+ for _ in range(200):
246
+ ret = {}
247
+ if id_mode == 'global':
248
+ proj_geo = face_model.compute_for_landmark_fit(
249
+ id_para.expand((num_frames, id_dim)), exp_para, euler_angle, trans, ret)
250
+ else:
251
+ proj_geo = face_model.compute_for_landmark_fit(
252
+ id_para, exp_para, euler_angle, trans, ret)
253
+ loss_lan = cal_lan_loss_fn(
254
+ proj_geo[:, :, :2], lms.detach())
255
+ # loss_lap = cal_lap_loss(proj_geo)
256
+ # the laplacian term barely affects euler, but it greatly improves trans
257
+ loss_lap = cal_lap_loss(id_para) + cal_lap_loss(exp_para) + cal_lap_loss(euler_angle) * 0.3 + cal_lap_loss(trans) * 0.3
258
+
259
+ loss_regid = torch.mean(id_para*id_para) # L2 regularization
260
+ loss_regexp = torch.mean(exp_para * exp_para)
261
+
262
+ loss_vel_id = cal_vel_loss(id_para)
263
+ loss_vel_exp = cal_vel_loss(exp_para)
264
+ loss = loss_lan + loss_regid * LAMBDA_REG_ID + loss_regexp * LAMBDA_REG_EXP + loss_vel_id * LAMBDA_REG_VEL_ID + loss_vel_exp * LAMBDA_REG_VEL_EXP + loss_lap * LAMBDA_REG_LAP
265
+ optimizer_idexp.zero_grad()
266
+ optimizer_frame.zero_grad()
267
+ loss.backward()
268
+ optimizer_idexp.step()
269
+ optimizer_frame.step()
270
+
271
+ # print(f"loss_lan: {loss_lan.item():.2f}, loss_reg_id: {loss_regid.item():.2f},loss_reg_exp: {loss_regexp.item():.2f},")
272
+ # print(f"euler_abs_mean: {euler_angle.abs().mean().item():.4f}, euler_std: {euler_angle.std().item():.4f}, euler_min: {euler_angle.min().item():.4f}, euler_max: {euler_angle.max().item():.4f}")
273
+ # print(f"trans_z_mean: {trans[...,2].mean().item():.4f}, trans_z_std: {trans[...,2].std().item():.4f}, trans_min: {trans[...,2].min().item():.4f}, trans_max: {trans[...,2].max().item():.4f}")
274
+
275
+ # start fine training, initialized from the roughly trained results
276
+ if id_mode == 'global':
277
+ id_para_ = lms.new_zeros((1, id_dim), requires_grad=False)
278
+ else:
279
+ id_para_ = lms.new_zeros((num_frames, id_dim), requires_grad=True)
280
+ id_para_.data = id_para.data.clone()
281
+ id_para = id_para_
282
+ exp_para_ = lms.new_zeros((num_frames, exp_dim), requires_grad=True)
283
+ exp_para_.data = exp_para.data.clone()
284
+ exp_para = exp_para_
285
+ euler_angle_ = lms.new_zeros((num_frames, 3), requires_grad=True)
286
+ euler_angle_.data = euler_angle.data.clone()
287
+ euler_angle = euler_angle_
288
+ trans_ = lms.new_zeros((num_frames, 3), requires_grad=True)
289
+ trans_.data = trans.data.clone()
290
+ trans = trans_
291
+
292
+ batch_size = 50
293
+ # stage 3: fine-fit the 3DMM coefficients in batches
294
+ for i in range(int((num_frames-1)/batch_size+1)):
295
+ if (i+1)*batch_size > num_frames:
296
+ start_n = num_frames-batch_size
297
+ sel_ids = np.arange(max(num_frames-batch_size,0), num_frames)
298
+ else:
299
+ start_n = i*batch_size
300
+ sel_ids = np.arange(i*batch_size, i*batch_size+batch_size)
301
+ sel_lms = lms[sel_ids]
302
+
303
+ if id_mode == 'global':
304
+ sel_id_para = id_para.expand((sel_ids.shape[0], id_dim))
305
+ else:
306
+ sel_id_para = id_para.new_zeros((batch_size, id_dim), requires_grad=True)
307
+ sel_id_para.data = id_para[sel_ids].clone()
308
+ sel_exp_para = exp_para.new_zeros(
309
+ (batch_size, exp_dim), requires_grad=True)
310
+ sel_exp_para.data = exp_para[sel_ids].clone()
311
+ sel_euler_angle = euler_angle.new_zeros(
312
+ (batch_size, 3), requires_grad=True)
313
+ sel_euler_angle.data = euler_angle[sel_ids].clone()
314
+ sel_trans = trans.new_zeros((batch_size, 3), requires_grad=True)
315
+ sel_trans.data = trans[sel_ids].clone()
316
+
317
+ if id_mode == 'global':
318
+ set_requires_grad([sel_exp_para, sel_euler_angle, sel_trans])
319
+ optimizer_cur_batch = torch.optim.Adam(
320
+ [sel_exp_para, sel_euler_angle, sel_trans], lr=0.005)
321
+ else:
322
+ set_requires_grad([sel_id_para, sel_exp_para, sel_euler_angle, sel_trans])
323
+ optimizer_cur_batch = torch.optim.Adam(
324
+ [sel_id_para, sel_exp_para, sel_euler_angle, sel_trans], lr=0.005)
325
+
326
+ for j in range(50):
327
+ ret = {}
328
+ proj_geo = face_model.compute_for_landmark_fit(
329
+ sel_id_para, sel_exp_para, sel_euler_angle, sel_trans, ret)
330
+ loss_lan = cal_lan_loss_fn(
331
+ proj_geo[:, :, :2], lms[sel_ids].detach())
332
+
333
+ # loss_lap = cal_lap_loss(proj_geo)
334
+ loss_lap = cal_lap_loss(sel_id_para) + cal_lap_loss(sel_exp_para) + cal_lap_loss(sel_euler_angle) * 0.3 + cal_lap_loss(sel_trans) * 0.3
335
+ loss_vel_id = cal_vel_loss(sel_id_para)
336
+ loss_vel_exp = cal_vel_loss(sel_exp_para)
337
+ log_dict = {
338
+ 'loss_vel_id': loss_vel_id,
339
+ 'loss_vel_exp': loss_vel_exp,
340
+ 'loss_vel_euler': cal_vel_loss(sel_euler_angle),
341
+ 'loss_vel_trans': cal_vel_loss(sel_trans),
342
+ }
343
+ loss_regid = torch.mean(sel_id_para*sel_id_para) # L2 regularization
344
+ loss_regexp = torch.mean(sel_exp_para*sel_exp_para)
345
+ loss = loss_lan + loss_regid * LAMBDA_REG_ID + loss_regexp * LAMBDA_REG_EXP + loss_lap * LAMBDA_REG_LAP + loss_vel_id * LAMBDA_REG_VEL_ID + loss_vel_exp * LAMBDA_REG_VEL_EXP
346
+
347
+ optimizer_cur_batch.zero_grad()
348
+ loss.backward()
349
+ optimizer_cur_batch.step()
350
+
351
+ if debug:
352
+ print(f"batch {i} | loss_lan: {loss_lan.item():.2f}, loss_reg_id: {loss_regid.item():.2f},loss_reg_exp: {loss_regexp.item():.2f},loss_lap_ldm:{loss_lap.item():.4f}")
353
+ print("|--------" + ', '.join([f"{k}: {v:.4f}" for k,v in log_dict.items()]))
354
+ if id_mode != 'global':
355
+ id_para[sel_ids].data = sel_id_para.data.clone()
356
+ exp_para[sel_ids].data = sel_exp_para.data.clone()
357
+ euler_angle[sel_ids].data = sel_euler_angle.data.clone()
358
+ trans[sel_ids].data = sel_trans.data.clone()
359
+
360
+ coeff_dict = {'id': id_para.detach().cpu().numpy(), 'exp': exp_para.detach().cpu().numpy(),
361
+ 'euler': euler_angle.detach().cpu().numpy(), 'trans': trans.detach().cpu().numpy()}
362
+
363
+ # filter data by side-view pose
364
+ # bad_yaw = False
365
+ # yaws = [] # not so accurate
366
+ # for index in range(coeff_dict["trans"].shape[0]):
367
+ # yaw = coeff_dict["euler"][index][1]
368
+ # yaw = np.abs(yaw)
369
+ # yaws.append(yaw)
370
+ # if yaw > large_yaw_threshold:
371
+ # bad_yaw = True
372
+
373
+ if debug:
374
+ import imageio
375
+ from utils.visualization.vis_cam3d.camera_pose_visualizer import CameraPoseVisualizer
376
+ from data_util.face3d_helper import Face3DHelper
377
+ from data_gen.utils.process_video.extract_blink import get_eye_area_percent
378
+ face3d_helper = Face3DHelper('deep_3drecon/BFM', keypoint_mode='mediapipe')
379
+
380
+ t = coeff_dict['exp'].shape[0]
381
+ if len(coeff_dict['id']) == 1:
382
+ coeff_dict['id'] = np.repeat(coeff_dict['id'], t, axis=0)
383
+ idexp_lm3d = face3d_helper.reconstruct_idexp_lm3d_np(coeff_dict['id'], coeff_dict['exp']).reshape([t, -1])
384
+ cano_lm3d = idexp_lm3d / 10 + face3d_helper.key_mean_shape.squeeze().reshape([1, -1]).cpu().numpy()
385
+ cano_lm3d = cano_lm3d.reshape([t, -1, 3])
386
+ WH = 512
387
+ cano_lm3d = (cano_lm3d * WH/2 + WH/2).astype(int)
388
+
389
+ with torch.no_grad():
390
+ rot = ParametricFaceModel.compute_rotation(euler_angle)
391
+ extrinsic = torch.zeros([rot.shape[0], 4, 4]).to(rot.device)
392
+ extrinsic[:, :3,:3] = rot
393
+ extrinsic[:, :3, 3] = trans # / 10
394
+ extrinsic[:, 3, 3] = 1
395
+ extrinsic = extrinsic.cpu().numpy()
396
+
397
+ xy_camera_visualizer = CameraPoseVisualizer(xlim=[extrinsic[:,0,3].min().item()-0.5,extrinsic[:,0,3].max().item()+0.5],ylim=[extrinsic[:,1,3].min().item()-0.5,extrinsic[:,1,3].max().item()+0.5], zlim=[extrinsic[:,2,3].min().item()-0.5,extrinsic[:,2,3].max().item()+0.5], view_mode='xy')
398
+ xz_camera_visualizer = CameraPoseVisualizer(xlim=[extrinsic[:,0,3].min().item()-0.5,extrinsic[:,0,3].max().item()+0.5],ylim=[extrinsic[:,1,3].min().item()-0.5,extrinsic[:,1,3].max().item()+0.5], zlim=[extrinsic[:,2,3].min().item()-0.5,extrinsic[:,2,3].max().item()+0.5], view_mode='xz')
399
+
400
+ if nerf:
401
+ debug_name = video_name.replace("/raw/", "/processed/").replace(".mp4", "/debug_fit_3dmm.mp4")
402
+ else:
403
+ debug_name = video_name.replace("/video/", "/coeff_fit_debug/").replace(".mp4", "_debug.mp4")
404
+ try:
405
+ os.makedirs(os.path.dirname(debug_name), exist_ok=True)
406
+ except: pass
407
+ writer = imageio.get_writer(debug_name, fps=25)
408
+ if id_mode == 'global':
409
+ id_para = id_para.repeat([exp_para.shape[0], 1])
410
+ proj_geo = face_model.compute_for_landmark_fit(id_para, exp_para, euler_angle, trans)
411
+ lm68s = proj_geo[:,:,:2].detach().cpu().numpy() # [T, 68,2]
412
+ lm68s = lm68s * img_scale_factor
413
+ lms = lms * img_scale_factor
414
+ lm68s[..., 1] = img_h - lm68s[..., 1] # flip the height axis
415
+ lms[..., 1] = img_h - lms[..., 1] # flip the height axis
416
+ lm68s = lm68s.astype(int)
417
+ for i in tqdm.trange(min(250, len(frames)), desc=f'rendering debug video to {debug_name}..'):
418
+ xy_cam3d_img = xy_camera_visualizer.extrinsic2pyramid(extrinsic[i], focal_len_scaled=0.25)
419
+ xy_cam3d_img = cv2.resize(xy_cam3d_img, (512,512))
420
+ xz_cam3d_img = xz_camera_visualizer.extrinsic2pyramid(extrinsic[i], focal_len_scaled=0.25)
421
+ xz_cam3d_img = cv2.resize(xz_cam3d_img, (512,512))
422
+
423
+ img = copy.deepcopy(frames[i])
424
+ img2 = copy.deepcopy(frames[i])
425
+
426
+ img = draw_axes(img, euler_angle[i,0].item(), euler_angle[i,1].item(), euler_angle[i,2].item(), lm68s[i][4][0].item(), lm68s[i, 4][1].item(), size=50)
427
+
428
+ gt_lm_color = (255, 0, 0)
429
+
430
+ for lm in lm68s[i]:
431
+ img = cv2.circle(img, lm, 1, (0, 0, 255), thickness=-1) # blue
432
+ for gt_lm in lms[i]:
433
+ img2 = cv2.circle(img2, gt_lm.cpu().numpy().astype(int), 2, gt_lm_color, thickness=1)
434
+
435
+ cano_lm3d_img = np.ones([WH, WH, 3], dtype=np.uint8) * 255
436
+ for j in range(len(cano_lm3d[i])):
437
+ x, y, _ = cano_lm3d[i, j]
438
+ color = (255,0,0)
439
+ cano_lm3d_img = cv2.circle(cano_lm3d_img, center=(x,y), radius=3, color=color, thickness=-1)
440
+ cano_lm3d_img = cv2.flip(cano_lm3d_img, 0)
441
+
442
+ _, secc_img = secc_renderer(id_para[0:1], exp_para[i:i+1], euler_angle[i:i+1]*0, trans[i:i+1]*0)
443
+ secc_img = (secc_img +1)*127.5
444
+ secc_img = F.interpolate(secc_img, size=(img_h, img_w))
445
+ secc_img = secc_img.permute(0, 2,3,1).int().cpu().numpy()[0]
446
+ out_img1 = np.concatenate([img, img2, secc_img], axis=1).astype(np.uint8)
447
+ font = cv2.FONT_HERSHEY_SIMPLEX
448
+ out_img2 = np.concatenate([xy_cam3d_img, xz_cam3d_img, cano_lm3d_img], axis=1).astype(np.uint8)
449
+ out_img = np.concatenate([out_img1, out_img2], axis=0)
450
+ writer.append_data(out_img)
451
+ writer.close()
452
+
453
+ # if bad_yaw:
454
+ # print(f"Skip {video_name} due to TOO LARGE YAW")
455
+ # return False
456
+
457
+ if save:
458
+ np.save(out_name, coeff_dict, allow_pickle=True)
459
+ return coeff_dict
460
+
461
+ def out_exist_job(vid_name):
462
+ out_name = vid_name.replace("/video/", "/coeff_fit_mp/").replace(".mp4","_coeff_fit_mp.npy")
463
+ lms_name = vid_name.replace("/video/", "/lms_2d/").replace(".mp4","_lms.npy")
464
+ if os.path.exists(out_name) or not os.path.exists(lms_name):
465
+ return None
466
+ else:
467
+ return vid_name
468
+
469
+ def get_todo_vid_names(vid_names):
470
+ if len(vid_names) == 1: # single video, nerf
471
+ return vid_names
472
+ todo_vid_names = []
473
+ for i, res in multiprocess_run_tqdm(out_exist_job, vid_names, num_workers=16):
474
+ if res is not None:
475
+ todo_vid_names.append(res)
476
+ return todo_vid_names
477
+
478
+
479
+ if __name__ == '__main__':
480
+ import argparse, glob, tqdm
481
+ parser = argparse.ArgumentParser()
482
+ # parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video')
483
+ parser.add_argument("--vid_dir", default='data/raw/videos/May_10s.mp4')
484
+ parser.add_argument("--ds_name", default='nerf') # 'nerf' | 'CelebV-HQ' | 'TH1KH_512' | etc
485
+ parser.add_argument("--seed", default=0, type=int)
486
+ parser.add_argument("--process_id", default=0, type=int)
487
+ parser.add_argument("--total_process", default=1, type=int)
488
+ parser.add_argument("--id_mode", default='global', type=str) # global | finegrained
489
+ parser.add_argument("--keypoint_mode", default='mediapipe', type=str)
490
+ parser.add_argument("--large_yaw_threshold", default=9999999.9, type=float) # could be 0.7
491
+ parser.add_argument("--debug", action='store_true')
492
+ parser.add_argument("--reset", action='store_true')
493
+ parser.add_argument("--load_names", action="store_true")
494
+
495
+ args = parser.parse_args()
496
+ vid_dir = args.vid_dir
497
+ ds_name = args.ds_name
498
+ load_names = args.load_names
499
+
500
+ print(f"args {args}")
501
+
502
+ if ds_name.lower() == 'nerf': # process a single video
503
+ vid_names = [vid_dir]
504
+ out_names = [video_name.replace("/raw/", "/processed/").replace(".mp4","_coeff_fit_mp.npy") for video_name in vid_names]
505
+ else: # process a whole dataset
506
+ if ds_name in ['lrs3_trainval']:
507
+ vid_name_pattern = os.path.join(vid_dir, "*/*.mp4")
508
+ elif ds_name in ['TH1KH_512', 'CelebV-HQ']:
509
+ vid_name_pattern = os.path.join(vid_dir, "*.mp4")
510
+ elif ds_name in ['lrs2', 'lrs3', 'voxceleb2', 'CMLR']:
511
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
512
+ elif ds_name in ["RAVDESS", 'VFHQ']:
513
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4")
514
+ else:
515
+ raise NotImplementedError()
516
+
517
+ vid_names_path = os.path.join(vid_dir, "vid_names.pkl")
518
+ if os.path.exists(vid_names_path) and load_names:
519
+ print(f"loading vid names from {vid_names_path}")
520
+ vid_names = load_file(vid_names_path)
521
+ else:
522
+ vid_names = multiprocess_glob(vid_name_pattern)
523
+ vid_names = sorted(vid_names)
524
+ print(f"saving vid names to {vid_names_path}")
525
+ save_file(vid_names_path, vid_names)
526
+ out_names = [video_name.replace("/video/", "/coeff_fit_mp/").replace(".mp4","_coeff_fit_mp.npy") for video_name in vid_names]
527
+
528
+ print(vid_names[:10])
529
+ random.seed(args.seed)
530
+ random.shuffle(vid_names)
531
+
532
+ face_model = ParametricFaceModel(bfm_folder='deep_3drecon/BFM',
533
+ camera_distance=10, focal=1015, keypoint_mode=args.keypoint_mode)
534
+ face_model.to(torch.device("cuda:0"))
535
+ secc_renderer = SECC_Renderer(512)
536
+ secc_renderer.to("cuda:0")
537
+
538
+ process_id = args.process_id
539
+ total_process = args.total_process
540
+ if total_process > 1:
541
+ assert process_id <= total_process -1
542
+ num_samples_per_process = len(vid_names) // total_process
543
+ if process_id == total_process - 1: # the last process takes the remainder
544
+ vid_names = vid_names[process_id * num_samples_per_process : ]
545
+ else:
546
+ vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
547
+
548
+ if not args.reset:
549
+ vid_names = get_todo_vid_names(vid_names)
550
+
551
+ failed_img_names = []
552
+ for i in tqdm.trange(len(vid_names), desc=f"process {process_id}: fitting 3dmm ..."):
553
+ img_name = vid_names[i]
554
+ try:
555
+ is_person_specific_data = ds_name=='nerf'
556
+ success = fit_3dmm_for_a_video(img_name, is_person_specific_data, args.id_mode, args.debug, large_yaw_threshold=args.large_yaw_threshold)
557
+ if not success:
558
+ failed_img_names.append(img_name)
559
+ except Exception as e:
560
+ print(img_name, e)
561
+ failed_img_names.append(img_name)
562
+ print(f"finished {i + 1} / {len(vid_names)} = {(i + 1) / len(vid_names):.4f}, failed {len(failed_img_names)} / {i + 1} = {len(failed_img_names) / (i + 1):.4f}")
563
+ sys.stdout.flush()
564
+ print(f"all failed image names: {failed_img_names}")
565
+ print(f"All finished!")
data_gen/utils/process_video/inpaint_torso_imgs.py ADDED
@@ -0,0 +1,193 @@
1
+ import cv2
2
+ import os
3
+ import glob
+ import tqdm
+ import numpy as np
4
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
5
+ from scipy.ndimage import binary_erosion, binary_dilation
6
+
7
+ from tasks.eg3ds.loss_utils.segment_loss.mp_segmenter import MediapipeSegmenter
8
+ seg_model = MediapipeSegmenter()
9
+
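+ # Note: this is a standalone variant of the torso/neck inpainting in
+ # data_gen/utils/process_video/extract_segment_imgs.py; it expects pre-extracted
+ # /gt_imgs/, /head_imgs/, /torso_imgs/ and /bg_imgs/ folders next to each video.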
10
+ def inpaint_torso_job(video_name, idx=None, total=None):
11
+ raw_img_dir = video_name.replace(".mp4", "").replace("/video/","/gt_imgs/")
12
+ img_names = glob.glob(os.path.join(raw_img_dir, "*.jpg"))
13
+
14
+ for image_path in tqdm.tqdm(img_names):
15
+ # read ori image
16
+ ori_image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED) # [H, W, 3]
17
+ segmap = seg_model._cal_seg_map(cv2.cvtColor(ori_image, cv2.COLOR_BGR2RGB))
18
+ head_part = (segmap[1] + segmap[3] + segmap[5]).astype(bool)
19
+ torso_part = (segmap[4]).astype(bool)
20
+ neck_part = (segmap[2]).astype(bool)
21
+ bg_part = segmap[0].astype(bool)
22
+ head_image = cv2.imread(image_path.replace("/gt_imgs/", "/head_imgs/"), cv2.IMREAD_UNCHANGED) # [H, W, 3]
23
+ torso_image = cv2.imread(image_path.replace("/gt_imgs/", "/torso_imgs/"), cv2.IMREAD_UNCHANGED) # [H, W, 3]
24
+ bg_image = cv2.imread(image_path.replace("/gt_imgs/", "/bg_imgs/"), cv2.IMREAD_UNCHANGED) # [H, W, 3]
25
+
26
+ # head_part = (head_image[...,0] != 0) & (head_image[...,1] != 0) & (head_image[...,2] != 0)
27
+ # torso_part = (torso_image[...,0] != 0) & (torso_image[...,1] != 0) & (torso_image[...,2] != 0)
28
+ # bg_part = (bg_image[...,0] != 0) & (bg_image[...,1] != 0) & (bg_image[...,2] != 0)
29
+
30
+ # get gt image
31
+ gt_image = ori_image.copy()
32
+ gt_image[bg_part] = bg_image[bg_part]
33
+ cv2.imwrite(image_path.replace('ori_imgs', 'gt_imgs'), gt_image)
34
+
35
+ # get torso image
36
+ torso_image = gt_image.copy() # rgb
37
+ torso_image[head_part] = 0
38
+ torso_alpha = 255 * np.ones((gt_image.shape[0], gt_image.shape[1], 1), dtype=np.uint8) # alpha
39
+
40
+ # torso part "vertical" in-painting...
41
+ L = 8 + 1
42
+ torso_coords = np.stack(np.nonzero(torso_part), axis=-1) # [M, 2]
43
+ # lexsort: sort 2D coords first by y then by x,
44
+ # ref: https://stackoverflow.com/questions/2706605/sorting-a-2d-numpy-array-by-multiple-axes
45
+ inds = np.lexsort((torso_coords[:, 0], torso_coords[:, 1]))
46
+ torso_coords = torso_coords[inds]
47
+ # choose the top pixel for each column
48
+ u, uid, ucnt = np.unique(torso_coords[:, 1], return_index=True, return_counts=True)
49
+ top_torso_coords = torso_coords[uid] # [m, 2]
50
+ # only keep top-is-head pixels
51
+ top_torso_coords_up = top_torso_coords.copy() - np.array([1, 0]) # [N, 2]
52
+ mask = head_part[tuple(top_torso_coords_up.T)]
53
+ if mask.any():
54
+ top_torso_coords = top_torso_coords[mask]
55
+ # get the color
56
+ top_torso_colors = gt_image[tuple(top_torso_coords.T)] # [m, 3]
57
+ # construct inpaint coords (vertically up, or minus in x)
58
+ inpaint_torso_coords = top_torso_coords[None].repeat(L, 0) # [L, m, 2]
59
+ inpaint_offsets = np.stack([-np.arange(L), np.zeros(L, dtype=np.int32)], axis=-1)[:, None] # [L, 1, 2]
60
+ inpaint_torso_coords += inpaint_offsets
61
+ inpaint_torso_coords = inpaint_torso_coords.reshape(-1, 2) # [Lm, 2]
62
+ inpaint_torso_colors = top_torso_colors[None].repeat(L, 0) # [L, m, 3]
63
+ darken_scaler = 0.98 ** np.arange(L).reshape(L, 1, 1) # [L, 1, 1]
64
+ inpaint_torso_colors = (inpaint_torso_colors * darken_scaler).reshape(-1, 3) # [Lm, 3]
65
+ # set color
66
+ torso_image[tuple(inpaint_torso_coords.T)] = inpaint_torso_colors
67
+
68
+ inpaint_torso_mask = np.zeros_like(torso_image[..., 0]).astype(bool)
69
+ inpaint_torso_mask[tuple(inpaint_torso_coords.T)] = True
70
+ else:
71
+ inpaint_torso_mask = None
72
+
73
+ # neck part "vertical" in-painting...
74
+ push_down = 4
75
+ L = 48 + push_down + 1
76
+
77
+ neck_part = binary_dilation(neck_part, structure=np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=bool), iterations=3)
78
+
79
+ neck_coords = np.stack(np.nonzero(neck_part), axis=-1) # [M, 2]
80
+ # lexsort: sort 2D coords first by y then by x,
81
+ # ref: https://stackoverflow.com/questions/2706605/sorting-a-2d-numpy-array-by-multiple-axes
82
+ inds = np.lexsort((neck_coords[:, 0], neck_coords[:, 1]))
83
+ neck_coords = neck_coords[inds]
84
+ # choose the top pixel for each column
85
+ u, uid, ucnt = np.unique(neck_coords[:, 1], return_index=True, return_counts=True)
86
+ top_neck_coords = neck_coords[uid] # [m, 2]
87
+ # only keep top-is-head pixels
88
+ top_neck_coords_up = top_neck_coords.copy() - np.array([1, 0])
89
+ mask = head_part[tuple(top_neck_coords_up.T)]
90
+
91
+ top_neck_coords = top_neck_coords[mask]
92
+ # push these top down for 4 pixels to make the neck inpainting more natural...
93
+ offset_down = np.minimum(ucnt[mask] - 1, push_down)
94
+ top_neck_coords += np.stack([offset_down, np.zeros_like(offset_down)], axis=-1)
95
+ # get the color
96
+ top_neck_colors = gt_image[tuple(top_neck_coords.T)] # [m, 3]
97
+ # construct inpaint coords (vertically up, or minus in x)
98
+ inpaint_neck_coords = top_neck_coords[None].repeat(L, 0) # [L, m, 2]
99
+ inpaint_offsets = np.stack([-np.arange(L), np.zeros(L, dtype=np.int32)], axis=-1)[:, None] # [L, 1, 2]
100
+ inpaint_neck_coords += inpaint_offsets
101
+ inpaint_neck_coords = inpaint_neck_coords.reshape(-1, 2) # [Lm, 2]
102
+ inpaint_neck_colors = top_neck_colors[None].repeat(L, 0) # [L, m, 3]
103
+ darken_scaler = 0.98 ** np.arange(L).reshape(L, 1, 1) # [L, 1, 1]
104
+ inpaint_neck_colors = (inpaint_neck_colors * darken_scaler).reshape(-1, 3) # [Lm, 3]
105
+ # set color
106
+ torso_image[tuple(inpaint_neck_coords.T)] = inpaint_neck_colors
107
+
108
+ # apply blurring to the inpainted area to avoid vertical-line artifacts...
109
+ inpaint_mask = np.zeros_like(torso_image[..., 0]).astype(bool)
110
+ inpaint_mask[tuple(inpaint_neck_coords.T)] = True
111
+
112
+ blur_img = torso_image.copy()
113
+ blur_img = cv2.GaussianBlur(blur_img, (5, 5), cv2.BORDER_DEFAULT)
114
+
115
+ torso_image[inpaint_mask] = blur_img[inpaint_mask]
116
+
117
+ # set mask
118
+ mask = (neck_part | torso_part | inpaint_mask)
119
+ if inpaint_torso_mask is not None:
120
+ mask = mask | inpaint_torso_mask
121
+ torso_image[~mask] = 0
122
+ torso_alpha[~mask] = 0
123
+
124
+ cv2.imwrite("0.png", np.concatenate([torso_image, torso_alpha], axis=-1)) # NOTE: debug output path; a production run would write into the per-video inpaint_torso_imgs directory checked by out_exist_job
125
+
126
+ print(f'[INFO] ===== extracted torso and gt images =====')
127
+
128
+
129
+ def out_exist_job(vid_name):
130
+ out_dir1 = vid_name.replace("/video/", "/inpaint_torso_imgs/").replace(".mp4","")
131
+ out_dir2 = vid_name.replace("/video/", "/inpaint_torso_with_bg_imgs/").replace(".mp4","")
132
+ out_dir3 = vid_name.replace("/video/", "/torso_imgs/").replace(".mp4","")
133
+ out_dir4 = vid_name.replace("/video/", "/torso_with_bg_imgs/").replace(".mp4","")
134
+
135
+ if os.path.exists(out_dir1) and os.path.exists(out_dir2) and os.path.exists(out_dir3) and os.path.exists(out_dir4):
136
+ num_frames = len(os.listdir(out_dir1))
137
+ if len(os.listdir(out_dir1)) == num_frames and len(os.listdir(out_dir2)) == num_frames and len(os.listdir(out_dir3)) == num_frames and len(os.listdir(out_dir4)) == num_frames:
138
+ return None
139
+ else:
140
+ return vid_name
141
+ else:
142
+ return vid_name
143
+
144
+ def get_todo_vid_names(vid_names):
145
+ todo_vid_names = []
146
+ for i, res in multiprocess_run_tqdm(out_exist_job, vid_names, num_workers=16):
147
+ if res is not None:
148
+ todo_vid_names.append(res)
149
+ return todo_vid_names
150
+
151
+ if __name__ == '__main__':
152
+ import argparse, glob, tqdm, random
153
+ parser = argparse.ArgumentParser()
154
+ parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video')
155
+ parser.add_argument("--ds_name", default='CelebV-HQ')
156
+ parser.add_argument("--num_workers", default=48, type=int)
157
+ parser.add_argument("--seed", default=0, type=int)
158
+ parser.add_argument("--process_id", default=0, type=int)
159
+ parser.add_argument("--total_process", default=1, type=int)
160
+ parser.add_argument("--reset", action='store_true')
161
+
162
+ inpaint_torso_job('/home/tiger/datasets/raw/CelebV-HQ/video/dgdEr-mXQT4_8.mp4')
163
+ # args = parser.parse_args()
164
+ # vid_dir = args.vid_dir
165
+ # ds_name = args.ds_name
166
+ # if ds_name in ['lrs3_trainval']:
167
+ # mp4_name_pattern = os.path.join(vid_dir, "*/*.mp4")
168
+ # if ds_name in ['TH1KH_512', 'CelebV-HQ']:
169
+ # vid_names = glob.glob(os.path.join(vid_dir, "*.mp4"))
170
+ # elif ds_name in ['lrs2', 'lrs3', 'voxceleb2']:
171
+ # vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
172
+ # vid_names = glob.glob(vid_name_pattern)
173
+ # vid_names = sorted(vid_names)
174
+ # random.seed(args.seed)
175
+ # random.shuffle(vid_names)
176
+
177
+ # process_id = args.process_id
178
+ # total_process = args.total_process
179
+ # if total_process > 1:
180
+ # assert process_id <= total_process -1
181
+ # num_samples_per_process = len(vid_names) // total_process
182
+ # if process_id == total_process:
183
+ # vid_names = vid_names[process_id * num_samples_per_process : ]
184
+ # else:
185
+ # vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
186
+
187
+ # if not args.reset:
188
+ # vid_names = get_todo_vid_names(vid_names)
189
+ # print(f"todo videos number: {len(vid_names)}")
190
+
191
+ # fn_args = [(vid_name,i,len(vid_names)) for i, vid_name in enumerate(vid_names)]
192
+ # for vid_name in multiprocess_run_tqdm(inpaint_torso_job ,fn_args, desc=f"Root process {args.process_id}: extracting segment images", num_workers=args.num_workers):
193
+ # pass
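A minimal sketch (toy mask, not real segmentation output) of the `lexsort` + `unique` idiom used above to find the top-most masked pixel of every column before the vertical in-painting:

```python
import numpy as np

# toy 4x4 boolean mask standing in for torso_part / neck_part
mask = np.array([
    [0, 0, 0, 0],
    [0, 1, 0, 0],
    [1, 1, 0, 1],
    [1, 1, 0, 1],
], dtype=bool)

coords = np.stack(np.nonzero(mask), axis=-1)      # [M, 2] as (row, col)
inds = np.lexsort((coords[:, 0], coords[:, 1]))   # sort by column, then by row
coords = coords[inds]
# the first occurrence of each column index has the smallest row, i.e. the top pixel
_, uid, _ = np.unique(coords[:, 1], return_index=True, return_counts=True)
top_coords = coords[uid]
print(top_coords)   # [[2 0] [1 1] [2 3]]
```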
data_gen/utils/process_video/resample_video_to_25fps_resize_to_512.py ADDED
@@ -0,0 +1,87 @@
1
+ import os, glob
2
+ import cv2
3
+ from utils.commons.os_utils import multiprocess_glob
4
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
5
+
6
+ def get_video_infos(video_path):
7
+ vid_cap = cv2.VideoCapture(video_path)
8
+ height = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
9
+ width = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
10
+ fps = vid_cap.get(cv2.CAP_PROP_FPS)
11
+ total_frames = int(vid_cap.get(cv2.CAP_PROP_FRAME_COUNT))
12
+ return {'height': height, 'width': width, 'fps': fps, 'total_frames':total_frames}
13
+
14
+ def extract_img_job(video_name:str):
15
+ out_path = video_name.replace("/video_raw/","/video/",1)
16
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
17
+ ffmpeg_path = "/usr/bin/ffmpeg"
18
+ vid_info = get_video_infos(video_name)
19
+ assert vid_info['width'] == vid_info['height']
20
+ cmd = f'{ffmpeg_path} -i {video_name} -vf fps={25},scale=w=512:h=512 -q:v 1 -c:v libx264 -pix_fmt yuv420p -b:v 2000k -v quiet -y {out_path}'
21
+ os.system(cmd)
22
+
23
+ def extract_img_job_crop(video_name:str):
24
+ out_path = video_name.replace("/video_raw/","/video/",1)
25
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
26
+ ffmpeg_path = "/usr/bin/ffmpeg"
27
+ vid_info = get_video_infos(video_name)
28
+ wh = min(vid_info['width'], vid_info['height'])
29
+ cmd = f'{ffmpeg_path} -i {video_name} -vf fps={25},crop={wh}:{wh},scale=w=512:h=512 -q:v 1 -c:v libx264 -pix_fmt yuv420p -b:v 2000k -v quiet -y {out_path}'
30
+ os.system(cmd)
31
+
32
+ def extract_img_job_crop_ravdess(video_name:str):
33
+ out_path = video_name.replace("/video_raw/","/video/",1)
34
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
35
+ ffmpeg_path = "/usr/bin/ffmpeg"
36
+ cmd = f'{ffmpeg_path} -i {video_name} -vf fps={25},crop=720:720,scale=w=512:h=512 -q:v 1 -c:v libx264 -pix_fmt yuv420p -b:v 2000k -v quiet -y {out_path}'
37
+ os.system(cmd)
38
+
39
+ if __name__ == '__main__':
40
+ import argparse, glob, tqdm, random
41
+ parser = argparse.ArgumentParser()
42
+ parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video_raw/')
43
+ parser.add_argument("--ds_name", default='CelebV-HQ')
44
+ parser.add_argument("--num_workers", default=32, type=int)
45
+ parser.add_argument("--process_id", default=0, type=int)
46
+ parser.add_argument("--total_process", default=1, type=int)
47
+ args = parser.parse_args()
48
+ print(f"args {args}")
49
+
50
+ vid_dir = args.vid_dir
51
+ ds_name = args.ds_name
52
+ if ds_name in ['lrs3_trainval']:
53
+ mp4_name_pattern = os.path.join(vid_dir, "*/*.mp4")
54
+ elif ds_name in ['TH1KH_512', 'CelebV-HQ']:
55
+ vid_names = multiprocess_glob(os.path.join(vid_dir, "*.mp4"))
56
+ elif ds_name in ['lrs2', 'lrs3', 'voxceleb2', 'CMLR']:
57
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
58
+ vid_names = multiprocess_glob(vid_name_pattern)
59
+ elif ds_name in ["RAVDESS", 'VFHQ']:
60
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4")
61
+ vid_names = multiprocess_glob(vid_name_pattern)
62
+ else:
63
+ raise NotImplementedError()
64
+ vid_names = sorted(vid_names)
65
+ print(f"total video number : {len(vid_names)}")
66
+ print(f"first {vid_names[0]} last {vid_names[-1]}")
67
+ # exit()
68
+ process_id = args.process_id
69
+ total_process = args.total_process
70
+ if total_process > 1:
71
+ assert process_id <= total_process -1
72
+ num_samples_per_process = len(vid_names) // total_process
73
+ if process_id == total_process - 1: # last process also takes the remainder
74
+ vid_names = vid_names[process_id * num_samples_per_process : ]
75
+ else:
76
+ vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
77
+
78
+ if ds_name == "RAVDESS":
79
+ for i, res in multiprocess_run_tqdm(extract_img_job_crop_ravdess, vid_names, num_workers=args.num_workers, desc="resampling videos"):
80
+ pass
81
+ elif ds_name == "CMLR":
82
+ for i, res in multiprocess_run_tqdm(extract_img_job_crop, vid_names, num_workers=args.num_workers, desc="resampling videos"):
83
+ pass
84
+ else:
85
+ for i, res in multiprocess_run_tqdm(extract_img_job, vid_names, num_workers=args.num_workers, desc="resampling videos"):
86
+ pass
87
+
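For illustration, the command assembled by `extract_img_job` for a hypothetical input path looks roughly like this:

```python
# Sketch only: the path is an assumption, the flags mirror extract_img_job above.
video_name = "/data/CelebV-HQ/video_raw/clip_0001.mp4"
out_path = video_name.replace("/video_raw/", "/video/", 1)
cmd = (f"/usr/bin/ffmpeg -i {video_name} -vf fps=25,scale=w=512:h=512 "
       f"-q:v 1 -c:v libx264 -pix_fmt yuv420p -b:v 2000k -v quiet -y {out_path}")
print(cmd)   # resamples to 25 fps and rescales to 512x512
```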
data_gen/utils/process_video/split_video_to_imgs.py ADDED
@@ -0,0 +1,53 @@
1
+ import os, glob
2
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
3
+
4
+ from data_gen.utils.path_converter import PathConverter, pc
5
+
6
+ # mp4_names = glob.glob("/home/tiger/datasets/raw/CelebV-HQ/video/*.mp4")
7
+
8
+ def extract_img_job(video_name, raw_img_dir=None):
9
+ if raw_img_dir is not None:
10
+ out_path = raw_img_dir
11
+ else:
12
+ out_path = pc.to(video_name.replace(".mp4", ""), "vid", "gt")
13
+ os.makedirs(out_path, exist_ok=True)
14
+ ffmpeg_path = "/usr/bin/ffmpeg"
15
+ cmd = f'{ffmpeg_path} -i {video_name} -vf fps={25},scale=w=512:h=512 -qmin 1 -q:v 1 -start_number 0 -v quiet {os.path.join(out_path, "%8d.jpg")}'
16
+ os.system(cmd)
17
+
18
+ if __name__ == '__main__':
19
+ import argparse, glob, tqdm, random
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument("--vid_dir", default='/home/tiger/datasets/raw/CelebV-HQ/video')
22
+ parser.add_argument("--ds_name", default='CelebV-HQ')
23
+ parser.add_argument("--num_workers", default=64, type=int)
24
+ parser.add_argument("--process_id", default=0, type=int)
25
+ parser.add_argument("--total_process", default=1, type=int)
26
+ args = parser.parse_args()
27
+ vid_dir = args.vid_dir
28
+ ds_name = args.ds_name
29
+ if ds_name in ['lrs3_trainval']:
30
+ mp4_name_pattern = os.path.join(vid_dir, "*/*.mp4")
31
+ elif ds_name in ['TH1KH_512', 'CelebV-HQ']:
32
+ vid_names = glob.glob(os.path.join(vid_dir, "*.mp4"))
33
+ elif ds_name in ['lrs2', 'lrs3', 'voxceleb2']:
34
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*.mp4")
35
+ vid_names = glob.glob(vid_name_pattern)
36
+ elif ds_name in ["RAVDESS", 'VFHQ']:
37
+ vid_name_pattern = os.path.join(vid_dir, "*/*/*/*.mp4")
38
+ vid_names = glob.glob(vid_name_pattern)
39
+ vid_names = sorted(vid_names)
40
+
41
+ process_id = args.process_id
42
+ total_process = args.total_process
43
+ if total_process > 1:
44
+ assert process_id <= total_process -1
45
+ num_samples_per_process = len(vid_names) // total_process
46
+ if process_id == total_process - 1: # last process also takes the remainder
47
+ vid_names = vid_names[process_id * num_samples_per_process : ]
48
+ else:
49
+ vid_names = vid_names[process_id * num_samples_per_process : (process_id+1) * num_samples_per_process]
50
+
51
+ for i, res in multiprocess_run_tqdm(extract_img_job, vid_names, num_workers=args.num_workers, desc="extracting images"):
52
+ pass
53
+
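The driver above follows the same dispatch pattern as the other preprocessing scripts; a hedged sketch of that pattern with a trivial stand-in job (the signature of the repository-internal `multiprocess_run_tqdm` helper is inferred from the calls above):

```python
# Sketch only: mirrors how per-video jobs are dispatched in these scripts.
from utils.commons.multiprocess_utils import multiprocess_run_tqdm  # repo-internal helper

def dummy_job(vid_name):
    # stand-in for extract_img_job / fit_3dmm_for_a_video / inpaint_torso_job
    return vid_name.endswith(".mp4")

vid_names = ["a.mp4", "b.mp4"]  # hypothetical
for i, res in multiprocess_run_tqdm(dummy_job, vid_names, num_workers=2, desc="demo"):
    pass  # results are only consumed to drive the progress bar here
```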
data_util/face3d_helper.py ADDED
@@ -0,0 +1,309 @@
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from scipy.io import loadmat
6
+
7
+ from deep_3drecon.deep_3drecon_models.bfm import perspective_projection
8
+
9
+
10
+ class Face3DHelper(nn.Module):
11
+ def __init__(self, bfm_dir='deep_3drecon/BFM', keypoint_mode='lm68', use_gpu=True):
12
+ super().__init__()
13
+ self.keypoint_mode = keypoint_mode # lm68 | mediapipe
14
+ self.bfm_dir = bfm_dir
15
+ self.load_3dmm()
16
+ if use_gpu: self.to("cuda")
17
+
18
+ def load_3dmm(self):
19
+ model = loadmat(os.path.join(self.bfm_dir, "BFM_model_front.mat"))
20
+ self.register_buffer('mean_shape',torch.from_numpy(model['meanshape'].transpose()).float()) # mean face shape. [3*N, 1], N=35709, xyz=3, ==> 3*N=107127
21
+ mean_shape = self.mean_shape.reshape([-1, 3])
22
+ # re-center
23
+ mean_shape = mean_shape - torch.mean(mean_shape, dim=0, keepdims=True)
24
+ self.mean_shape = mean_shape.reshape([-1, 1])
25
+ self.register_buffer('id_base',torch.from_numpy(model['idBase']).float()) # identity basis. [3*N,80], we have 80 eigen faces for identity
26
+ self.register_buffer('exp_base',torch.from_numpy(model['exBase']).float()) # expression basis. [3*N,64], we have 64 eigen faces for expression
27
+
28
+ self.register_buffer('mean_texure',torch.from_numpy(model['meantex'].transpose()).float()) # mean face texture. [3*N,1] (0-255)
29
+ self.register_buffer('tex_base',torch.from_numpy(model['texBase']).float()) # texture basis. [3*N,80], rgb=3
30
+
31
+ self.register_buffer('point_buf',torch.from_numpy(model['point_buf']).float()) # triangle indices for each vertex that lies in. starts from 1. [N,8] (1-F)
32
+ self.register_buffer('face_buf',torch.from_numpy(model['tri']).float()) # vertex indices in each triangle. starts from 1. [F,3] (1-N)
33
+ if self.keypoint_mode == 'mediapipe':
34
+ self.register_buffer('key_points', torch.from_numpy(np.load("deep_3drecon/BFM/index_mp468_from_mesh35709.npy").astype(np.int64)))
35
+ unmatch_mask = self.key_points < 0
36
+ self.key_points[unmatch_mask] = 0
37
+ else:
38
+ self.register_buffer('key_points',torch.from_numpy(model['keypoints'].squeeze().astype(np.int_)).long()) # vertex indices of 68 facial landmarks. starts from 1. [68,1]
39
+
40
+
41
+ self.register_buffer('key_mean_shape',self.mean_shape.reshape([-1,3])[self.key_points,:])
42
+ self.register_buffer('key_id_base', self.id_base.reshape([-1,3,80])[self.key_points, :, :].reshape([-1,80]))
43
+ self.register_buffer('key_exp_base', self.exp_base.reshape([-1,3,64])[self.key_points, :, :].reshape([-1,64]))
44
+ self.key_id_base_np = self.key_id_base.cpu().numpy()
45
+ self.key_exp_base_np = self.key_exp_base.cpu().numpy()
46
+
47
+ self.register_buffer('persc_proj', torch.tensor(perspective_projection(focal=1015, center=112)))
48
+ def split_coeff(self, coeff):
49
+ """
50
+ coeff: Tensor[B, T, c=257] or [T, c=257]
51
+ """
52
+ ret_dict = {
53
+ 'identity': coeff[..., :80], # identity, [b, t, c=80]
54
+ 'expression': coeff[..., 80:144], # expression, [b, t, c=64]
55
+ 'texture': coeff[..., 144:224], # texture, [b, t, c=80]
56
+ 'euler': coeff[..., 224:227], # euler angles for pose, [b, t, c=3]
57
+ 'translation': coeff[..., 254:257], # translation, [b, t, c=3]
58
+ 'gamma': coeff[..., 227:254] # lighting, [b, t, c=27]
59
+ }
60
+ return ret_dict
61
+
62
+ def reconstruct_face_mesh(self, id_coeff, exp_coeff):
63
+ """
64
+ Generate a pose-independent 3D face mesh!
65
+ id_coeff: Tensor[T, c=80]
66
+ exp_coeff: Tensor[T, c=64]
67
+ """
68
+ id_coeff = id_coeff.to(self.key_id_base.device)
69
+ exp_coeff = exp_coeff.to(self.key_id_base.device)
70
+ mean_face = self.mean_shape.squeeze().reshape([1, -1]) # [3N, 1] ==> [1, 3N]
71
+ id_base, exp_base = self.id_base, self.exp_base # [3*N, C]
72
+ identity_diff_face = torch.matmul(id_coeff, id_base.transpose(0,1)) # [t,c],[c,3N] ==> [t,3N]
73
+ expression_diff_face = torch.matmul(exp_coeff, exp_base.transpose(0,1)) # [t,c],[c,3N] ==> [t,3N]
74
+
75
+ face = mean_face + identity_diff_face + expression_diff_face # [t,3N]
76
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
77
+ # re-centering the face with mean_xyz, so the face will be in [-1, 1]
78
+ # mean_xyz = self.mean_shape.squeeze().reshape([-1,3]).mean(dim=0) # [1, 3]
79
+ # face_mesh = face - mean_xyz.unsqueeze(0) # [t,N,3]
80
+ return face
81
+
82
+ def reconstruct_cano_lm3d(self, id_coeff, exp_coeff):
83
+ """
84
+ Generate 3D landmark with keypoint base!
85
+ id_coeff: Tensor[T, c=80]
86
+ exp_coeff: Tensor[T, c=64]
87
+ """
88
+ id_coeff = id_coeff.to(self.key_id_base.device)
89
+ exp_coeff = exp_coeff.to(self.key_id_base.device)
90
+ mean_face = self.key_mean_shape.squeeze().reshape([1, -1]) # [3*68, 1] ==> [1, 3*68]
91
+ id_base, exp_base = self.key_id_base, self.key_exp_base # [3*68, C]
92
+ identity_diff_face = torch.matmul(id_coeff, id_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
93
+ expression_diff_face = torch.matmul(exp_coeff, exp_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
94
+
95
+ face = mean_face + identity_diff_face + expression_diff_face # [t,3N]
96
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
97
+ # re-centering the face with mean_xyz, so the face will be in [-1, 1]
98
+ # mean_xyz = self.key_mean_shape.squeeze().reshape([-1,3]).mean(dim=0) # [1, 3]
99
+ # lm3d = face - mean_xyz.unsqueeze(0) # [t,N,3]
100
+ return face
101
+
102
+ def reconstruct_lm3d(self, id_coeff, exp_coeff, euler, trans, to_camera=True):
103
+ """
104
+ Generate 3D landmark with keypoint base!
105
+ id_coeff: Tensor[T, c=80]
106
+ exp_coeff: Tensor[T, c=64]
107
+ """
108
+ id_coeff = id_coeff.to(self.key_id_base.device)
109
+ exp_coeff = exp_coeff.to(self.key_id_base.device)
110
+ mean_face = self.key_mean_shape.squeeze().reshape([1, -1]) # [3*68, 1] ==> [1, 3*68]
111
+ id_base, exp_base = self.key_id_base, self.key_exp_base # [3*68, C]
112
+ identity_diff_face = torch.matmul(id_coeff, id_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
113
+ expression_diff_face = torch.matmul(exp_coeff, exp_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
114
+
115
+ face = mean_face + identity_diff_face + expression_diff_face # [t,3N]
116
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
117
+ # re-centering the face with mean_xyz, so the face will be in [-1, 1]
118
+ rot = self.compute_rotation(euler)
119
+ # transform
120
+ lm3d = face @ rot + trans.unsqueeze(1) # [t, N, 3]
121
+ # to camera
122
+ if to_camera:
123
+ lm3d[...,-1] = 10 - lm3d[...,-1]
124
+ return lm3d
125
+
126
+ def reconstruct_lm2d_nerf(self, id_coeff, exp_coeff, euler, trans):
127
+ lm2d = self.reconstruct_lm2d(id_coeff, exp_coeff, euler, trans, to_camera=False)
128
+ lm2d[..., 0] = 1 - lm2d[..., 0]
129
+ lm2d[..., 1] = 1 - lm2d[..., 1]
130
+ return lm2d
131
+
132
+ def reconstruct_lm2d(self, id_coeff, exp_coeff, euler, trans, to_camera=True):
133
+ """
134
+ Generate 3D landmark with keypoint base!
135
+ id_coeff: Tensor[T, c=80]
136
+ exp_coeff: Tensor[T, c=64]
137
+ """
138
+ is_btc_flag = True if id_coeff.ndim == 3 else False
139
+ if is_btc_flag:
140
+ b,t,_ = id_coeff.shape
141
+ id_coeff = id_coeff.reshape([b*t,-1])
142
+ exp_coeff = exp_coeff.reshape([b*t,-1])
143
+ euler = euler.reshape([b*t,-1])
144
+ trans = trans.reshape([b*t,-1])
145
+ id_coeff = id_coeff.to(self.key_id_base.device)
146
+ exp_coeff = exp_coeff.to(self.key_id_base.device)
147
+ mean_face = self.key_mean_shape.squeeze().reshape([1, -1]) # [3*68, 1] ==> [1, 3*68]
148
+ id_base, exp_base = self.key_id_base, self.key_exp_base # [3*68, C]
149
+ identity_diff_face = torch.matmul(id_coeff, id_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
150
+ expression_diff_face = torch.matmul(exp_coeff, exp_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
151
+
152
+ face = mean_face + identity_diff_face + expression_diff_face # [t,3N]
153
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
154
+ # re-centering the face with mean_xyz, so the face will be in [-1, 1]
155
+ rot = self.compute_rotation(euler)
156
+ # transform
157
+ lm3d = face @ rot + trans.unsqueeze(1) # [t, N, 3]
158
+ # to camera
159
+ if to_camera:
160
+ lm3d[...,-1] = 10 - lm3d[...,-1]
161
+ # to image_plane
162
+ lm3d = lm3d @ self.persc_proj
163
+ lm2d = lm3d[..., :2] / lm3d[..., 2:]
164
+ # flip
165
+ lm2d[..., 1] = 224 - lm2d[..., 1]
166
+ lm2d /= 224
167
+ if is_btc_flag:
168
+ return lm2d.reshape([b,t,-1,2])
169
+ return lm2d
170
+
171
+ def compute_rotation(self, euler):
172
+ """
173
+ Return:
174
+ rot -- torch.tensor, size (B, 3, 3) pts @ trans_mat
175
+
176
+ Parameters:
177
+ euler -- torch.tensor, size (B, 3), radian
178
+ """
179
+
180
+ batch_size = euler.shape[0]
181
+ euler = euler.to(self.key_id_base.device)
182
+ ones = torch.ones([batch_size, 1]).to(self.key_id_base.device)
183
+ zeros = torch.zeros([batch_size, 1]).to(self.key_id_base.device)
184
+ x, y, z = euler[:, :1], euler[:, 1:2], euler[:, 2:],
185
+
186
+ rot_x = torch.cat([
187
+ ones, zeros, zeros,
188
+ zeros, torch.cos(x), -torch.sin(x),
189
+ zeros, torch.sin(x), torch.cos(x)
190
+ ], dim=1).reshape([batch_size, 3, 3])
191
+
192
+ rot_y = torch.cat([
193
+ torch.cos(y), zeros, torch.sin(y),
194
+ zeros, ones, zeros,
195
+ -torch.sin(y), zeros, torch.cos(y)
196
+ ], dim=1).reshape([batch_size, 3, 3])
197
+
198
+ rot_z = torch.cat([
199
+ torch.cos(z), -torch.sin(z), zeros,
200
+ torch.sin(z), torch.cos(z), zeros,
201
+ zeros, zeros, ones
202
+ ], dim=1).reshape([batch_size, 3, 3])
203
+
204
+ rot = rot_z @ rot_y @ rot_x
205
+ return rot.permute(0, 2, 1)
206
+
207
+ def reconstruct_idexp_lm3d(self, id_coeff, exp_coeff):
208
+ """
209
+ Generate 3D landmark with keypoint base!
210
+ id_coeff: Tensor[T, c=80]
211
+ exp_coeff: Tensor[T, c=64]
212
+ """
213
+ id_coeff = id_coeff.to(self.key_id_base.device)
214
+ exp_coeff = exp_coeff.to(self.key_id_base.device)
215
+ id_base, exp_base = self.key_id_base, self.key_exp_base # [3*68, C]
216
+ identity_diff_face = torch.matmul(id_coeff, id_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
217
+ expression_diff_face = torch.matmul(exp_coeff, exp_base.transpose(0,1)) # [t,c],[c,3*68] ==> [t,3*68]
218
+
219
+ face = identity_diff_face + expression_diff_face # [t,3N]
220
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
221
+ lm3d = face * 10
222
+ return lm3d
223
+
224
+ def reconstruct_idexp_lm3d_np(self, id_coeff, exp_coeff):
225
+ """
226
+ Generate 3D landmark with keypoint base!
227
+ id_coeff: Tensor[T, c=80]
228
+ exp_coeff: Tensor[T, c=64]
229
+ """
230
+ id_base, exp_base = self.key_id_base_np, self.key_exp_base_np # [3*68, C]
231
+ identity_diff_face = np.dot(id_coeff, id_base.T) # [t,c],[c,3*68] ==> [t,3*68]
232
+ expression_diff_face = np.dot(exp_coeff, exp_base.T) # [t,c],[c,3*68] ==> [t,3*68]
233
+
234
+ face = identity_diff_face + expression_diff_face # [t,3N]
235
+ face = face.reshape([face.shape[0], -1, 3]) # [t,N,3]
236
+ lm3d = face * 10
237
+ return lm3d
238
+
239
+ def get_eye_mouth_lm_from_lm3d(self, lm3d):
240
+ eye_lm = lm3d[:, 17:48] # [T, 31, 3]
241
+ mouth_lm = lm3d[:, 48:68] # [T, 20, 3]
242
+ return eye_lm, mouth_lm
243
+
244
+ def get_eye_mouth_lm_from_lm3d_batch(self, lm3d):
245
+ eye_lm = lm3d[:, :, 17:48] # [T, 31, 3]
246
+ mouth_lm = lm3d[:, :, 48:68] # [T, 20, 3]
247
+ return eye_lm, mouth_lm
248
+
249
+ def close_mouth_for_idexp_lm3d(self, idexp_lm3d, freeze_as_first_frame=True):
250
+ idexp_lm3d = idexp_lm3d.reshape([-1, 68,3])
251
+ num_frames = idexp_lm3d.shape[0]
252
+ eps = 0.0
253
+ # [n_landmarks=68, xyz=3]; x is left-right, y is up-down, z is depth
254
+ idexp_lm3d[:,49:54, 1] = (idexp_lm3d[:,49:54, 1] + idexp_lm3d[:,range(59,54,-1), 1])/2 + eps * 2
255
+ idexp_lm3d[:,range(59,54,-1), 1] = (idexp_lm3d[:,49:54, 1] + idexp_lm3d[:,range(59,54,-1), 1])/2 - eps * 2
256
+
257
+ idexp_lm3d[:,61:64, 1] = (idexp_lm3d[:,61:64, 1] + idexp_lm3d[:,range(67,64,-1), 1])/2 + eps
258
+ idexp_lm3d[:,range(67,64,-1), 1] = (idexp_lm3d[:,61:64, 1] + idexp_lm3d[:,range(67,64,-1), 1])/2 - eps
259
+
260
+ idexp_lm3d[:,49:54, 1] += (0.03 - idexp_lm3d[:,49:54, 1].mean(dim=1) + idexp_lm3d[:,61:64, 1].mean(dim=1)).unsqueeze(1).repeat([1,5])
261
+ idexp_lm3d[:,range(59,54,-1), 1] += (-0.03 - idexp_lm3d[:,range(59,54,-1), 1].mean(dim=1) + idexp_lm3d[:,range(67,64,-1), 1].mean(dim=1)).unsqueeze(1).repeat([1,5])
262
+
263
+ if freeze_as_first_frame:
264
+ idexp_lm3d[:, 48:68,] = idexp_lm3d[0, 48:68].unsqueeze(0).clone().repeat([num_frames, 1,1])*0
265
+ return idexp_lm3d.cpu()
266
+
267
+ def close_eyes_for_idexp_lm3d(self, idexp_lm3d):
268
+ idexp_lm3d = idexp_lm3d.reshape([-1, 68,3])
269
+ eps = 0.003
270
+ idexp_lm3d[:,37:39, 1] = (idexp_lm3d[:,37:39, 1] + idexp_lm3d[:,range(41,39,-1), 1])/2 + eps
271
+ idexp_lm3d[:,range(41,39,-1), 1] = (idexp_lm3d[:,37:39, 1] + idexp_lm3d[:,range(41,39,-1), 1])/2 - eps
272
+
273
+ idexp_lm3d[:,43:45, 1] = (idexp_lm3d[:,43:45, 1] + idexp_lm3d[:,range(47,45,-1), 1])/2 + eps
274
+ idexp_lm3d[:,range(47,45,-1), 1] = (idexp_lm3d[:,43:45, 1] + idexp_lm3d[:,range(47,45,-1), 1])/2 - eps
275
+
276
+ return idexp_lm3d
277
+
278
+ if __name__ == '__main__':
279
+ import cv2
280
+
281
+ font = cv2.FONT_HERSHEY_SIMPLEX
282
+
283
+ face_mesh_helper = Face3DHelper('deep_3drecon/BFM')
284
+ coeff_npy = 'data/coeff_fit_mp/crop_nana_003_coeff_fit_mp.npy'
285
+ coeff_dict = np.load(coeff_npy, allow_pickle=True).tolist()
286
+ lm3d = face_mesh_helper.reconstruct_lm2d(torch.tensor(coeff_dict['id']).cuda(), torch.tensor(coeff_dict['exp']).cuda(), torch.tensor(coeff_dict['euler']).cuda(), torch.tensor(coeff_dict['trans']).cuda() )
287
+
288
+ WH = 512
289
+ lm3d = (lm3d * WH).cpu().int().numpy()
290
+ eye_idx = list(range(36,48))
291
+ mouth_idx = list(range(48,68))
292
+ import imageio
293
+ debug_name = 'debug_lm3d.mp4'
294
+ writer = imageio.get_writer(debug_name, fps=25)
295
+ for i_img in range(len(lm3d)):
296
+ lm2d = lm3d[i_img ,:, :2] # [68, 2]
297
+ img = np.ones([WH, WH, 3], dtype=np.uint8) * 255
298
+ for i in range(len(lm2d)):
299
+ x, y = lm2d[i]
300
+ if i in eye_idx:
301
+ color = (0,0,255)
302
+ elif i in mouth_idx:
303
+ color = (0,255,0)
304
+ else:
305
+ color = (255,0,0)
306
+ img = cv2.circle(img, center=(x,y), radius=3, color=color, thickness=-1)
307
+ img = cv2.putText(img, f"{i}", org=(x,y), fontFace=font, fontScale=0.3, color=(255,0,0))
308
+ writer.append_data(img)
309
+ writer.close()
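All of the reconstruction methods above evaluate the same linear 3DMM, face = mean_shape + id_base · id_coeff + exp_base · exp_coeff; a minimal numpy sketch with toy dimensions (random bases, not the real BFM files):

```python
import numpy as np

# Toy stand-ins for the bases loaded in load_3dmm(); the real shapes are
# mean_shape [3N, 1], id_base [3N, 80], exp_base [3N, 64] with N = 35709.
N, T = 5, 2
mean_shape = np.random.randn(3 * N, 1)
id_base = np.random.randn(3 * N, 80)
exp_base = np.random.randn(3 * N, 64)
id_coeff = np.random.randn(T, 80)    # per-frame identity coefficients
exp_coeff = np.random.randn(T, 64)   # per-frame expression coefficients

face = mean_shape.reshape(1, -1) + id_coeff @ id_base.T + exp_coeff @ exp_base.T  # [T, 3N]
face = face.reshape(T, N, 3)   # [T, N, 3] vertices, as in reconstruct_face_mesh
print(face.shape)
```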
deep_3drecon/BFM/.gitkeep ADDED
File without changes
deep_3drecon/bfm_left_eye_faces.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9651756ea2c0fac069a1edf858ed1f125eddc358fa74c529a370c1e7b5730d28
3
+ size 4680
deep_3drecon/bfm_right_eye_faces.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28cb5bbacf578d30a3d5006ec28c617fe5a3ecaeeeb87d9433a884e0f0301a2e
3
+ size 4648
deep_3drecon/deep_3drecon_models/bfm.py ADDED
@@ -0,0 +1,426 @@
1
+ """This script defines the parametric 3d face model for Deep3DFaceRecon_pytorch
2
+ """
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from scipy.io import loadmat
8
+ import os
9
+ # from utils.commons.tensor_utils import convert_like
10
+
11
+
12
+ def perspective_projection(focal, center):
13
+ # return p.T (N, 3) @ (3, 3)
14
+ return np.array([
15
+ focal, 0, center,
16
+ 0, focal, center,
17
+ 0, 0, 1
18
+ ]).reshape([3, 3]).astype(np.float32).transpose() # note the transpose here!
19
+
20
+ class SH:
21
+ def __init__(self):
22
+ self.a = [np.pi, 2 * np.pi / np.sqrt(3.), 2 * np.pi / np.sqrt(8.)]
23
+ self.c = [1/np.sqrt(4 * np.pi), np.sqrt(3.) / np.sqrt(4 * np.pi), 3 * np.sqrt(5.) / np.sqrt(12 * np.pi)]
24
+
25
+
26
+
27
+ class ParametricFaceModel:
28
+ def __init__(self,
29
+ bfm_folder='./BFM',
30
+ recenter=True,
31
+ camera_distance=10.,
32
+ init_lit=np.array([
33
+ 0.8, 0, 0, 0, 0, 0, 0, 0, 0
34
+ ]),
35
+ focal=1015.,
36
+ center=112.,
37
+ is_train=True,
38
+ default_name='BFM_model_front.mat',
39
+ keypoint_mode='mediapipe'):
40
+
41
+ model = loadmat(os.path.join(bfm_folder, default_name))
42
+ # mean face shape. [3*N,1]
43
+ self.mean_shape = model['meanshape'].astype(np.float32)
44
+ # identity basis. [3*N,80]
45
+ self.id_base = model['idBase'].astype(np.float32)
46
+ # expression basis. [3*N,64]
47
+ self.exp_base = model['exBase'].astype(np.float32)
48
+ # mean face texture. [3*N,1] (0-255)
49
+ self.mean_tex = model['meantex'].astype(np.float32)
50
+ # texture basis. [3*N,80]
51
+ self.tex_base = model['texBase'].astype(np.float32)
52
+ # face indices for each vertex that lies in. starts from 0. [N,8]
53
+ self.point_buf = model['point_buf'].astype(np.int64) - 1
54
+ # vertex indices for each face. starts from 0. [F,3]
55
+ self.face_buf = model['tri'].astype(np.int64) - 1
56
+ # vertex indices for 68 landmarks. starts from 0. [68,1]
57
+ if keypoint_mode == 'mediapipe':
58
+ self.keypoints = np.load("deep_3drecon/BFM/index_mp468_from_mesh35709.npy").astype(np.int64)
59
+ unmatch_mask = self.keypoints < 0
60
+ self.keypoints[unmatch_mask] = 0
61
+ else:
62
+ self.keypoints = np.squeeze(model['keypoints']).astype(np.int64) - 1
63
+
64
+ if is_train:
65
+ # vertex indices for small face region to compute photometric error. starts from 0.
66
+ self.front_mask = np.squeeze(model['frontmask2_idx']).astype(np.int64) - 1
67
+ # vertex indices for each face from small face region. starts from 0. [f,3]
68
+ self.front_face_buf = model['tri_mask2'].astype(np.int64) - 1
69
+ # vertex indices for pre-defined skin region to compute reflectance loss
70
+ self.skin_mask = np.squeeze(model['skinmask'])
71
+
72
+ if recenter:
73
+ mean_shape = self.mean_shape.reshape([-1, 3])
74
+ mean_shape = mean_shape - np.mean(mean_shape, axis=0, keepdims=True)
75
+ self.mean_shape = mean_shape.reshape([-1, 1])
76
+
77
+ self.key_mean_shape = self.mean_shape.reshape([-1, 3])[self.keypoints, :].reshape([-1, 3])
78
+ self.key_id_base = self.id_base.reshape([-1, 3,80])[self.keypoints, :].reshape([-1, 80])
79
+ self.key_exp_base = self.exp_base.reshape([-1, 3, 64])[self.keypoints, :].reshape([-1, 64])
80
+
81
+ self.focal = focal
82
+ self.center = center
83
+ self.persc_proj = perspective_projection(focal, center)
84
+ self.device = 'cpu'
85
+ self.camera_distance = camera_distance
86
+ self.SH = SH()
87
+ self.init_lit = init_lit.reshape([1, 1, -1]).astype(np.float32)
88
+
89
+ self.initialized = False
90
+
91
+ def to(self, device):
92
+ self.device = device
93
+ for key, value in self.__dict__.items():
94
+ if type(value).__module__ == np.__name__:
95
+ setattr(self, key, torch.tensor(value).to(device))
96
+ self.initialized = True
97
+ return self
98
+
99
+ def compute_shape(self, id_coeff, exp_coeff):
100
+ """
101
+ Return:
102
+ face_shape -- torch.tensor, size (B, N, 3)
103
+
104
+ Parameters:
105
+ id_coeff -- torch.tensor, size (B, 80), identity coeffs
106
+ exp_coeff -- torch.tensor, size (B, 64), expression coeffs
107
+ """
108
+ batch_size = id_coeff.shape[0]
109
+ id_part = torch.einsum('ij,aj->ai', self.id_base, id_coeff)
110
+ exp_part = torch.einsum('ij,aj->ai', self.exp_base, exp_coeff)
111
+ face_shape = id_part + exp_part + self.mean_shape.reshape([1, -1])
112
+ return face_shape.reshape([batch_size, -1, 3])
113
+
114
+ def compute_key_shape(self, id_coeff, exp_coeff):
115
+ """
116
+ Return:
117
+ face_shape -- torch.tensor, size (B, N, 3)
118
+
119
+ Parameters:
120
+ id_coeff -- torch.tensor, size (B, 80), identity coeffs
121
+ exp_coeff -- torch.tensor, size (B, 64), expression coeffs
122
+ """
123
+ batch_size = id_coeff.shape[0]
124
+ id_part = torch.einsum('ij,aj->ai', self.key_id_base, id_coeff)
125
+ exp_part = torch.einsum('ij,aj->ai', self.key_exp_base, exp_coeff)
126
+ face_shape = id_part + exp_part + self.key_mean_shape.reshape([1, -1])
127
+ return face_shape.reshape([batch_size, -1, 3])
128
+
129
+ def compute_texture(self, tex_coeff, normalize=True):
130
+ """
131
+ Return:
132
+ face_texture -- torch.tensor, size (B, N, 3), in RGB order, range (0, 1.)
133
+
134
+ Parameters:
135
+ tex_coeff -- torch.tensor, size (B, 80)
136
+ """
137
+ batch_size = tex_coeff.shape[0]
138
+ face_texture = torch.einsum('ij,aj->ai', self.tex_base, tex_coeff) + self.mean_tex
139
+ if normalize:
140
+ face_texture = face_texture / 255.
141
+ return face_texture.reshape([batch_size, -1, 3])
142
+
143
+
144
+ def compute_norm(self, face_shape):
145
+ """
146
+ Return:
147
+ vertex_norm -- torch.tensor, size (B, N, 3)
148
+
149
+ Parameters:
150
+ face_shape -- torch.tensor, size (B, N, 3)
151
+ """
152
+
153
+ v1 = face_shape[:, self.face_buf[:, 0]]
154
+ v2 = face_shape[:, self.face_buf[:, 1]]
155
+ v3 = face_shape[:, self.face_buf[:, 2]]
156
+ e1 = v1 - v2
157
+ e2 = v2 - v3
158
+ face_norm = torch.cross(e1, e2, dim=-1)
159
+ face_norm = F.normalize(face_norm, dim=-1, p=2)
160
+ face_norm = torch.cat([face_norm, torch.zeros(face_norm.shape[0], 1, 3).to(self.device)], dim=1)
161
+
162
+ vertex_norm = torch.sum(face_norm[:, self.point_buf], dim=2)
163
+ vertex_norm = F.normalize(vertex_norm, dim=-1, p=2)
164
+ return vertex_norm
165
+
166
+
167
+ def compute_color(self, face_texture, face_norm, gamma):
168
+ """
169
+ Return:
170
+ face_color -- torch.tensor, size (B, N, 3), range (0, 1.)
171
+
172
+ Parameters:
173
+ face_texture -- torch.tensor, size (B, N, 3), from texture model, range (0, 1.)
174
+ face_norm -- torch.tensor, size (B, N, 3), rotated face normal
175
+ gamma -- torch.tensor, size (B, 27), SH coeffs
176
+ """
177
+ batch_size = gamma.shape[0]
178
+ v_num = face_texture.shape[1]
179
+ a, c = self.SH.a, self.SH.c
180
+ gamma = gamma.reshape([batch_size, 3, 9])
181
+ gamma = gamma + self.init_lit
182
+ gamma = gamma.permute(0, 2, 1)
183
+ Y = torch.cat([
184
+ a[0] * c[0] * torch.ones_like(face_norm[..., :1]).to(self.device),
185
+ -a[1] * c[1] * face_norm[..., 1:2],
186
+ a[1] * c[1] * face_norm[..., 2:],
187
+ -a[1] * c[1] * face_norm[..., :1],
188
+ a[2] * c[2] * face_norm[..., :1] * face_norm[..., 1:2],
189
+ -a[2] * c[2] * face_norm[..., 1:2] * face_norm[..., 2:],
190
+ 0.5 * a[2] * c[2] / np.sqrt(3.) * (3 * face_norm[..., 2:] ** 2 - 1),
191
+ -a[2] * c[2] * face_norm[..., :1] * face_norm[..., 2:],
192
+ 0.5 * a[2] * c[2] * (face_norm[..., :1] ** 2 - face_norm[..., 1:2] ** 2)
193
+ ], dim=-1)
194
+ r = Y @ gamma[..., :1]
195
+ g = Y @ gamma[..., 1:2]
196
+ b = Y @ gamma[..., 2:]
197
+ face_color = torch.cat([r, g, b], dim=-1) * face_texture
198
+ return face_color
199
+
200
+ @staticmethod
201
+ def compute_rotation(angles, device='cpu'):
202
+ """
203
+ Return:
204
+ rot -- torch.tensor, size (B, 3, 3) pts @ trans_mat
205
+
206
+ Parameters:
207
+ angles -- torch.tensor, size (B, 3), radian
208
+ """
209
+
210
+ batch_size = angles.shape[0]
211
+ angles = angles.to(device)
212
+ ones = torch.ones([batch_size, 1]).to(device)
213
+ zeros = torch.zeros([batch_size, 1]).to(device)
214
+ x, y, z = angles[:, :1], angles[:, 1:2], angles[:, 2:],
215
+
216
+ rot_x = torch.cat([
217
+ ones, zeros, zeros,
218
+ zeros, torch.cos(x), -torch.sin(x),
219
+ zeros, torch.sin(x), torch.cos(x)
220
+ ], dim=1).reshape([batch_size, 3, 3])
221
+
222
+ rot_y = torch.cat([
223
+ torch.cos(y), zeros, torch.sin(y),
224
+ zeros, ones, zeros,
225
+ -torch.sin(y), zeros, torch.cos(y)
226
+ ], dim=1).reshape([batch_size, 3, 3])
227
+
228
+ rot_z = torch.cat([
229
+ torch.cos(z), -torch.sin(z), zeros,
230
+ torch.sin(z), torch.cos(z), zeros,
231
+ zeros, zeros, ones
232
+ ], dim=1).reshape([batch_size, 3, 3])
233
+
234
+ rot = rot_z @ rot_y @ rot_x
235
+ return rot.permute(0, 2, 1)
236
+
237
+
238
+ def to_camera(self, face_shape):
239
+ face_shape[..., -1] = self.camera_distance - face_shape[..., -1] # reverse the depth axis, add a fixed offset of length
240
+ return face_shape
241
+
242
+ def to_image(self, face_shape):
243
+ """
244
+ Return:
245
+ face_proj -- torch.tensor, size (B, N, 2), y direction is opposite to v direction
246
+
247
+ Parameters:
248
+ face_shape -- torch.tensor, size (B, N, 3)
249
+ """
250
+ # to image_plane
251
+ face_proj = face_shape @ self.persc_proj
252
+ face_proj = face_proj[..., :2] / face_proj[..., 2:]
253
+
254
+ return face_proj
255
+
256
+
257
+ def transform(self, face_shape, rot, trans):
258
+ """
259
+ Return:
260
+ face_shape -- torch.tensor, size (B, N, 3) pts @ rot + trans
261
+
262
+ Parameters:
263
+ face_shape -- torch.tensor, si≥ze (B, N, 3)
264
+ rot -- torch.tensor, size (B, 3, 3)
265
+ trans -- torch.tensor, size (B, 3)
266
+ """
267
+ return face_shape @ rot + trans.unsqueeze(1)
268
+
269
+
270
+ def get_landmarks(self, face_proj):
271
+ """
272
+ Return:
273
+ face_lms -- torch.tensor, size (B, 68, 2)
274
+
275
+ Parameters:
276
+ face_proj -- torch.tensor, size (B, N, 2)
277
+ """
278
+ return face_proj[:, self.keypoints]
279
+
280
+ def split_coeff(self, coeffs):
281
+ """
282
+ Return:
283
+ coeffs_dict -- a dict of torch.tensors
284
+
285
+ Parameters:
286
+ coeffs -- torch.tensor, size (B, 257)
287
+ """
288
+ id_coeffs = coeffs[:, :80]
289
+ exp_coeffs = coeffs[:, 80: 144]
290
+ tex_coeffs = coeffs[:, 144: 224]
291
+ angles = coeffs[:, 224: 227]
292
+ gammas = coeffs[:, 227: 254]
293
+ translations = coeffs[:, 254:]
294
+ return {
295
+ 'id': id_coeffs,
296
+ 'exp': exp_coeffs,
297
+ 'tex': tex_coeffs,
298
+ 'angle': angles,
299
+ 'gamma': gammas,
300
+ 'trans': translations
301
+ }
302
+ def compute_for_render(self, coeffs):
303
+ """
304
+ Return:
305
+ face_vertex -- torch.tensor, size (B, N, 3), in camera coordinate
306
+ face_color -- torch.tensor, size (B, N, 3), in RGB order
307
+ landmark -- torch.tensor, size (B, 68, 2), y direction is opposite to v direction
308
+ Parameters:
309
+ coeffs -- torch.tensor, size (B, 257)
310
+ """
311
+ coef_dict = self.split_coeff(coeffs)
312
+ face_shape = self.compute_shape(coef_dict['id'], coef_dict['exp'])
313
+ rotation = self.compute_rotation(coef_dict['angle'], device=self.device)
314
+
315
+
316
+ face_shape_transformed = self.transform(face_shape, rotation, coef_dict['trans'])
317
+ face_vertex = self.to_camera(face_shape_transformed)
318
+
319
+ face_proj = self.to_image(face_vertex)
320
+ landmark = self.get_landmarks(face_proj)
321
+
322
+ face_texture = self.compute_texture(coef_dict['tex'])
323
+ face_norm = self.compute_norm(face_shape)
324
+ face_norm_roted = face_norm @ rotation
325
+ face_color = self.compute_color(face_texture, face_norm_roted, coef_dict['gamma'])
326
+
327
+ return face_vertex, face_texture, face_color, landmark
328
+
329
+ def compute_face_vertex(self, id, exp, angle, trans):
330
+ """
331
+ Return:
332
+ face_vertex -- torch.tensor, size (B, N, 3), in camera coordinate
333
+ face_color -- torch.tensor, size (B, N, 3), in RGB order
334
+ landmark -- torch.tensor, size (B, 68, 2), y direction is opposite to v direction
335
+ Parameters:
336
+ coeffs -- torch.tensor, size (B, 257)
337
+ """
338
+ if not self.initialized:
339
+ self.to(id.device)
340
+ face_shape = self.compute_shape(id, exp)
341
+ rotation = self.compute_rotation(angle, device=self.device)
342
+ face_shape_transformed = self.transform(face_shape, rotation, trans)
343
+ face_vertex = self.to_camera(face_shape_transformed)
344
+ return face_vertex
345
+
346
+ def compute_for_landmark_fit(self, id, exp, angles, trans, ret=None):
347
+ """
348
+ Return:
349
+ face_vertex -- torch.tensor, size (B, N, 3), in camera coordinate
350
+ face_color -- torch.tensor, size (B, N, 3), in RGB order
351
+ landmark -- torch.tensor, size (B, 68, 2), y direction is opposite to v direction
352
+ Parameters:
353
+ coeffs -- torch.tensor, size (B, 257)
354
+ """
355
+ face_shape = self.compute_key_shape(id, exp)
356
+ rotation = self.compute_rotation(angles, device=self.device)
357
+
358
+ face_shape_transformed = self.transform(face_shape, rotation, trans)
359
+ face_vertex = self.to_camera(face_shape_transformed)
360
+
361
+ face_proj = self.to_image(face_vertex)
362
+ landmark = face_proj
363
+ return landmark
364
+
365
+ def compute_for_landmark_fit_nerf(self, id, exp, angles, trans, ret=None):
366
+ """
367
+ Return:
368
+ face_vertex -- torch.tensor, size (B, N, 3), in camera coordinate
369
+ face_color -- torch.tensor, size (B, N, 3), in RGB order
370
+ landmark -- torch.tensor, size (B, 68, 2), y direction is opposite to v direction
371
+ Parameters:
372
+ coeffs -- torch.tensor, size (B, 257)
373
+ """
374
+ face_shape = self.compute_key_shape(id, exp)
375
+ rotation = self.compute_rotation(angles, device=self.device)
376
+
377
+ face_shape_transformed = self.transform(face_shape, rotation, trans)
378
+ face_vertex = face_shape_transformed # no to_camera
379
+
380
+ face_proj = self.to_image(face_vertex)
381
+ landmark = face_proj
382
+ return landmark
383
+
384
+ # def compute_for_landmark_fit(self, id, exp, angles, trans, ret={}):
385
+ # """
386
+ # Return:
387
+ # face_vertex -- torch.tensor, size (B, N, 3), in camera coordinate
388
+ # face_color -- torch.tensor, size (B, N, 3), in RGB order
389
+ # landmark -- torch.tensor, size (B, 68, 2), y direction is opposite to v direction
390
+ # Parameters:
391
+ # coeffs -- torch.tensor, size (B, 257)
392
+ # """
393
+ # face_shape = self.compute_shape(id, exp)
394
+ # rotation = self.compute_rotation(angles)
395
+
396
+ # face_shape_transformed = self.transform(face_shape, rotation, trans)
397
+ # face_vertex = self.to_camera(face_shape_transformed)
398
+
399
+ # face_proj = self.to_image(face_vertex)
400
+ # landmark = self.get_landmarks(face_proj)
401
+ # return landmark
402
+
403
+ def compute_for_render_fit(self, id, exp, angles, trans, tex, gamma):
404
+ """
405
+ Return:
406
+ face_vertex -- torch.tensor, size (B, N, 3), in camera coordinate
407
+ face_color -- torch.tensor, size (B, N, 3), in RGB order
408
+ landmark -- torch.tensor, size (B, 68, 2), y direction is opposite to v direction
409
+ Parameters:
410
+ coeffs -- torch.tensor, size (B, 257)
411
+ """
412
+ face_shape = self.compute_shape(id, exp)
413
+ rotation = self.compute_rotation(angles, device=self.device)
414
+
415
+ face_shape_transformed = self.transform(face_shape, rotation, trans)
416
+ face_vertex = self.to_camera(face_shape_transformed)
417
+
418
+ face_proj = self.to_image(face_vertex)
419
+ landmark = self.get_landmarks(face_proj)
420
+
421
+ face_texture = self.compute_texture(tex)
422
+ face_norm = self.compute_norm(face_shape)
423
+ face_norm_roted = face_norm @ rotation
424
+ face_color = self.compute_color(face_texture, face_norm_roted, gamma)
425
+
426
+ return face_color, face_vertex, landmark
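For quick reference, the 257-dim coefficient layout that `split_coeff` assumes, checked on a random tensor (the slices are copied from the code above):

```python
import torch

coeffs = torch.randn(4, 257)   # hypothetical batch of fitted coefficients
parts = {
    'id':    coeffs[:, :80],       # identity
    'exp':   coeffs[:, 80:144],    # expression
    'tex':   coeffs[:, 144:224],   # texture
    'angle': coeffs[:, 224:227],   # euler angles (radian)
    'gamma': coeffs[:, 227:254],   # SH lighting
    'trans': coeffs[:, 254:257],   # translation
}
assert sum(v.shape[1] for v in parts.values()) == 257
```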
deep_3drecon/ncc_code.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da54a620c0981d43cc9f30b3d8b3f5d4beb0ec0e27127a1ef3fb62ea50913609
3
+ size 428636
deep_3drecon/secc_renderer.py ADDED
@@ -0,0 +1,78 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from einops import rearrange
5
+
6
+ from deep_3drecon.util.mesh_renderer import MeshRenderer
7
+ from deep_3drecon.deep_3drecon_models.bfm import ParametricFaceModel
8
+
9
+
10
+ class SECC_Renderer(nn.Module):
11
+ def __init__(self, rasterize_size=None, device="cuda"):
12
+ super().__init__()
13
+ self.face_model = ParametricFaceModel('deep_3drecon/BFM')
14
+ self.fov = 2 * np.arctan(self.face_model.center / self.face_model.focal) * 180 / np.pi
15
+ self.znear = 5.
16
+ self.zfar = 15.
17
+ if rasterize_size is None:
18
+ rasterize_size = 2*self.face_model.center
19
+ self.face_renderer = MeshRenderer(rasterize_fov=self.fov, znear=self.znear, zfar=self.zfar, rasterize_size=rasterize_size, use_opengl=False).cuda()
20
+ face_feat = np.load("deep_3drecon/ncc_code.npy", allow_pickle=True)
21
+ self.face_feat = torch.tensor(face_feat.T).unsqueeze(0).to(device=device)
22
+
23
+ del_index_re = np.load('deep_3drecon/bfm_right_eye_faces.npy')
24
+ del_index_re = del_index_re - 1
25
+ del_index_le = np.load('deep_3drecon/bfm_left_eye_faces.npy')
26
+ del_index_le = del_index_le - 1
27
+ face_buf_list = []
28
+ for i in range(self.face_model.face_buf.shape[0]):
29
+ if i not in del_index_re and i not in del_index_le:
30
+ face_buf_list.append(self.face_model.face_buf[i])
31
+ face_buf_arr = np.array(face_buf_list)
32
+ self.face_buf = torch.tensor(face_buf_arr).to(device=device)
33
+
34
+ def forward(self, id, exp, euler, trans):
35
+ """
36
+ id, exp, euler, trans: [B, C] or [B, T, C]
37
+ return:
38
+ MASK: [B, 1, 512, 512], values in {0.0, 1.0}, where 1.0 denotes the face region
39
+ SECC MAP: [B, 3, 512, 512], values scaled to [-1, 1]
40
+ if input is BTC format, return [B, C, T, H, W]
41
+ """
42
+ bs = id.shape[0]
43
+ is_btc_flag = id.ndim == 3
44
+ if is_btc_flag:
45
+ t = id.shape[1]
46
+ bs = bs * t
47
+ id, exp, euler, trans = id.reshape([bs,-1]), exp.reshape([bs,-1]), euler.reshape([bs,-1]), trans.reshape([bs,-1])
48
+
49
+ face_vertex = self.face_model.compute_face_vertex(id, exp, euler, trans)
50
+ face_mask, _, secc_face = self.face_renderer(
51
+ face_vertex, self.face_buf.unsqueeze(0).repeat([bs, 1, 1]), feat=self.face_feat.repeat([bs,1,1]))
52
+ secc_face = (secc_face - 0.5) / 0.5 # scale to -1~1
53
+
54
+ if is_btc_flag:
55
+ bs = bs // t
56
+ face_mask = rearrange(face_mask, "(n t) c h w -> n c t h w", n=bs, t=t)
57
+ secc_face = rearrange(secc_face, "(n t) c h w -> n c t h w", n=bs, t=t)
58
+ return face_mask, secc_face
59
+
60
+
61
+ if __name__ == '__main__':
62
+ import imageio
63
+
64
+ renderer = SECC_Renderer(rasterize_size=512)
65
+ ret = np.load("data/processed/videos/May/vid_coeff_fit.npy", allow_pickle=True).tolist()
66
+ idx = 6
67
+ id = torch.tensor(ret['id']).cuda()[idx:idx+1]
68
+ exp = torch.tensor(ret['exp']).cuda()[idx:idx+1]
69
+ angle = torch.tensor(ret['euler']).cuda()[idx:idx+1]
70
+ trans = torch.tensor(ret['trans']).cuda()[idx:idx+1]
71
+ mask, secc = renderer(id, exp, angle*0, trans*0) # [1, 1, 512, 512], [1, 3, 512, 512]
72
+
73
+ out_mask = mask[0].permute(1,2,0)
74
+ out_mask = (out_mask * 127.5 + 127.5).int().cpu().numpy()
75
+ imageio.imwrite("out_mask.png", out_mask)
76
+ out_img = secc[0].permute(1,2,0)
77
+ out_img = (out_img * 127.5 + 127.5).int().cpu().numpy()
78
+ imageio.imwrite("out_secc.png", out_img)
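A small sketch (random tensors) of the einops reshaping used in `forward` to fold per-frame renderer outputs back into a [B, C, T, H, W] clip when the inputs were given in BTC format:

```python
import torch
from einops import rearrange

n, t, c, h, w = 2, 3, 3, 8, 8           # hypothetical sizes
frames = torch.randn(n * t, c, h, w)    # per-frame output, [(n t), c, h, w]
clip = rearrange(frames, "(n t) c h w -> n c t h w", n=n, t=t)
print(clip.shape)   # torch.Size([2, 3, 3, 8, 8])
```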
deep_3drecon/util/mesh_renderer.py ADDED
@@ -0,0 +1,131 @@
1
+ """This script is the differentiable renderer for Deep3DFaceRecon_pytorch
2
+ Note: the antialiasing step is missing in the current version.
3
+ """
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import kornia
7
+ from kornia.geometry.camera import pixel2cam
8
+ import numpy as np
9
+ from typing import List
10
+ from scipy.io import loadmat
11
+ from torch import nn
12
+ import traceback
13
+
14
+ try:
15
+ import pytorch3d.ops
16
+ from pytorch3d.structures import Meshes
17
+ from pytorch3d.renderer import (
18
+ look_at_view_transform,
19
+ FoVPerspectiveCameras,
20
+ DirectionalLights,
21
+ RasterizationSettings,
22
+ MeshRenderer,
23
+ MeshRasterizer,
24
+ SoftPhongShader,
25
+ TexturesUV,
26
+ )
27
+ except:
28
+ traceback.print_exc()
29
+ # def ndc_projection(x=0.1, n=1.0, f=50.0):
30
+ # return np.array([[n/x, 0, 0, 0],
31
+ # [ 0, n/-x, 0, 0],
32
+ # [ 0, 0, -(f+n)/(f-n), -(2*f*n)/(f-n)],
33
+ # [ 0, 0, -1, 0]]).astype(np.float32)
34
+
35
+ class MeshRenderer(nn.Module):
36
+ def __init__(self,
37
+ rasterize_fov,
38
+ znear=0.1,
39
+ zfar=10,
40
+ rasterize_size=224,**args):
41
+ super(MeshRenderer, self).__init__()
42
+
43
+ # x = np.tan(np.deg2rad(rasterize_fov * 0.5)) * znear
44
+ # self.ndc_proj = torch.tensor(ndc_projection(x=x, n=znear, f=zfar)).matmul(
45
+ # torch.diag(torch.tensor([1., -1, -1, 1])))
46
+ self.rasterize_size = rasterize_size
47
+ self.fov = rasterize_fov
48
+ self.znear = znear
49
+ self.zfar = zfar
50
+
51
+ self.rasterizer = None
52
+
53
+ def forward(self, vertex, tri, feat=None):
54
+ """
55
+ Return:
56
+ mask -- torch.tensor, size (B, 1, H, W)
57
+ depth -- torch.tensor, size (B, 1, H, W)
58
+ features(optional) -- torch.tensor, size (B, C, H, W) if feat is not None
59
+
60
+ Parameters:
61
+ vertex -- torch.tensor, size (B, N, 3)
62
+ tri -- torch.tensor, size (B, M, 3) or (M, 3), triangles
63
+ feat(optional) -- torch.tensor, size (B, N ,C), features
64
+ """
65
+ device = vertex.device
66
+ rsize = int(self.rasterize_size)
67
+ # ndc_proj = self.ndc_proj.to(device)
68
+ # transform to homogeneous coordinates of 3d vertices; the direction of y is the same as v
69
+ if vertex.shape[-1] == 3:
70
+ vertex = torch.cat([vertex, torch.ones([*vertex.shape[:2], 1]).to(device)], dim=-1)
71
+ vertex[..., 0] = -vertex[..., 0]
72
+
73
+
74
+ # vertex_ndc = vertex @ ndc_proj.t()
75
+ if self.rasterizer is None:
76
+ self.rasterizer = MeshRasterizer()
77
+ print("create rasterizer on device cuda:%d"%device.index)
78
+
79
+ # ranges = None
80
+ # if isinstance(tri, List) or len(tri.shape) == 3:
81
+ # vum = vertex_ndc.shape[1]
82
+ # fnum = torch.tensor([f.shape[0] for f in tri]).unsqueeze(1).to(device)
83
+ # fstartidx = torch.cumsum(fnum, dim=0) - fnum
84
+ # ranges = torch.cat([fstartidx, fnum], axis=1).type(torch.int32).cpu()
85
+ # for i in range(tri.shape[0]):
86
+ # tri[i] = tri[i] + i*vum
87
+ # vertex_ndc = torch.cat(vertex_ndc, dim=0)
88
+ # tri = torch.cat(tri, dim=0)
89
+
90
+ # for range_mode vetex: [B*N, 4], tri: [B*M, 3], for instance_mode vetex: [B, N, 4], tri: [M, 3]
91
+ tri = tri.type(torch.int32).contiguous()
92
+
93
+ # rasterize
94
+ cameras = FoVPerspectiveCameras(
95
+ device=device,
96
+ fov=self.fov,
97
+ znear=self.znear,
98
+ zfar=self.zfar,
99
+ )
100
+
101
+ raster_settings = RasterizationSettings(
102
+ image_size=rsize
103
+ )
104
+
105
+ # print(vertex.shape, tri.shape)
106
+ if tri.ndim == 2:
107
+ tri = tri.unsqueeze(0)
108
+ mesh = Meshes(vertex.contiguous()[...,:3], tri)
109
+
110
+ fragments = self.rasterizer(mesh, cameras = cameras, raster_settings = raster_settings)
111
+ rast_out = fragments.pix_to_face.squeeze(-1)
112
+ depth = fragments.zbuf
113
+
114
+ # render depth
115
+ depth = depth.permute(0, 3, 1, 2)
116
+ mask = (rast_out > 0).float().unsqueeze(1)
117
+ depth = mask * depth
118
+
119
+
120
+ image = None
121
+ if feat is not None:
122
+ attributes = feat.reshape(-1,3)[mesh.faces_packed()]
123
+ image = pytorch3d.ops.interpolate_face_attributes(fragments.pix_to_face,
124
+ fragments.bary_coords,
125
+ attributes)
126
+ # print(image.shape)
127
+ image = image.squeeze(-2).permute(0, 3, 1, 2)
128
+ image = mask * image
129
+
130
+ return mask, depth, image
131
+
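A minimal sketch of rasterizing a single triangle with the same pytorch3d primitives used above (illustrative settings; requires pytorch3d to be installed):

```python
import torch
from pytorch3d.structures import Meshes
from pytorch3d.renderer import FoVPerspectiveCameras, RasterizationSettings, MeshRasterizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# one triangle a few units in front of the camera (+z is in front of the camera in pytorch3d)
verts = torch.tensor([[[-1.0, -1.0, 5.0], [1.0, -1.0, 5.0], [0.0, 1.0, 5.0]]], device=device)
faces = torch.tensor([[[0, 1, 2]]], dtype=torch.int64, device=device)
mesh = Meshes(verts=verts, faces=faces)

cameras = FoVPerspectiveCameras(device=device)            # default fov / znear / zfar
raster_settings = RasterizationSettings(image_size=128)
rasterizer = MeshRasterizer(cameras=cameras, raster_settings=raster_settings)

fragments = rasterizer(mesh)
mask = (fragments.pix_to_face[..., 0] > -1).float()       # pix_to_face is -1 off-mesh
print(mask.shape, mask.sum())                             # [1, 128, 128], > 0
```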
docs/prepare_env/install_guide-zh.md ADDED
@@ -0,0 +1,35 @@
1
+ # 环境配置
2
+ [English Doc](./install_guide.md)
3
+
4
+ 本文档陈述了搭建Real3D-Portrait Python环境的步骤,我们使用了Conda来管理依赖。
5
+
6
+ 以下配置已在 A100/V100 + CUDA11.7 中进行了验证。
7
+
8
+
9
+ # 1. 安装CUDA
10
+ 我们推荐安装CUDA `11.7`,其他CUDA版本(例如`10.2`、`12.x`)也可能有效。
11
+
12
+ # 2. 安装Python依赖
13
+ ```
14
+ cd <Real3DPortraitRoot>
15
+ source <CondaRoot>/bin/activate
16
+ conda create -n real3dportrait python=3.9
17
+ conda activate real3dportrait
18
+ conda install conda-forge::ffmpeg # ffmpeg with libx264 codec to turn images to video
19
+
20
+ # 我们推荐安装torch2.0.1+cuda11.7.
21
+ conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.7 -c pytorch -c nvidia
22
+
23
+ # 从源代码安装,需要比较长的时间 (如果遇到各种time-out问题,建议使用代理)
24
+ pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable"
25
+
26
+ # MMCV安装
27
+ pip install cython
28
+ pip install openmim==0.3.9
29
+ mim install mmcv==2.1.0 # 使用mim来加速mmcv安装
30
+
31
+ # 其他依赖项
32
+ pip install -r docs/prepare_env/requirements.txt -v
33
+
34
+ ```
35
+
docs/prepare_env/install_guide.md ADDED
@@ -0,0 +1,34 @@
1
+ # Prepare the Environment
2
+ [中文文档](./install_guide-zh.md)
3
+
4
+ This guide describes how to build a Python environment for Real3D-Portrait with Conda.
5
+
6
+ The following installation process has been verified on A100/V100 GPUs with CUDA 11.7.
7
+
8
+
9
+ # 1. Install CUDA
10
+ We recommend installing CUDA `11.7` (verified on various types of GPUs), but other CUDA versions (such as `10.2`, `12.x`) may also work.
11
+
12
+ # 2. Install Python Packages
13
+ ```
14
+ cd <Real3DPortraitRoot>
15
+ source <CondaRoot>/bin/activate
16
+ conda create -n real3dportrait python=3.9
17
+ conda activate real3dportrait
18
+ conda install conda-forge::ffmpeg # ffmpeg with libx264 codec to turn images to video
19
+
20
+ ### We recommend torch2.0.1+cuda11.7.
21
+ conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.7 -c pytorch -c nvidia
22
+
23
+ # Build from source, it may take a long time (Proxy is recommended if encountering the time-out problem)
24
+ pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable"
25
+
26
+ # MMCV for some network structure
27
+ pip install cython
28
+ pip install openmim==0.3.9
29
+ mim install mmcv==2.1.0 # use mim to speed up installation for mmcv
30
+
31
+ # other dependencies
32
+ pip install -r docs/prepare_env/requirements.txt -v
33
+
34
+ ```
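+ 
+ # 3. Verify the Environment (Optional)
+ As a quick sanity check (a minimal sketch; it assumes the packages above installed successfully), you can confirm that PyTorch sees the GPU and that pytorch3d and mmcv import correctly:
+ ```
+ import torch, pytorch3d, mmcv
+ print(torch.__version__, torch.cuda.is_available())  # expect the version installed above (e.g. 2.0.1) and True on a CUDA machine
+ print(pytorch3d.__version__, mmcv.__version__)
+ ```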
docs/prepare_env/requirements.txt ADDED
@@ -0,0 +1,75 @@
1
+ Cython
2
+ numpy # ==1.23.0
3
+ numba==0.56.4
4
+ pandas
5
+ transformers
6
+ scipy==1.11.1 # required by cal_fid. https://github.com/mseitzer/pytorch-fid/issues/103
7
+ scikit-learn
8
+ scikit-image
9
+ # tensorflow # optional; install the GPU build if you need it
10
+ tensorboard
11
+ tensorboardX
12
+ python_speech_features
13
+ resampy
14
+ opencv_python
15
+ face_alignment
16
+ matplotlib
17
+ configargparse
18
+ librosa==0.9.2
19
+ praat-parselmouth # ==0.4.3
20
+ trimesh
21
+ kornia==0.5.0
22
+ PyMCubes
23
+ lpips
24
+ setuptools # ==59.5.0
25
+ ffmpeg-python
26
+ moviepy
27
+ dearpygui
28
+ ninja
29
+ # pyaudio # for extracting esperanto
30
+ mediapipe
31
+ protobuf
32
+ decord
33
+ soundfile
34
+ pillow
35
+ # torch # it's better to install torch with conda
36
+ av
37
+ timm
38
+ pretrainedmodels
39
+ faiss-cpu # for fast nearest camera pose retrieval
40
+ einops
41
+ # mmcv # use mim install is faster
42
+
43
+ # conditional flow matching
44
+ beartype
45
+ torchode
46
+ torchdiffeq
47
+
48
+ # tts
49
+ cython
50
+ textgrid
51
+ pyloudnorm
52
+ websocket-client
53
+ pyworld==0.2.1rc0
54
+ pypinyin==0.42.0
55
+ webrtcvad
56
+ torchshow
57
+
58
+ # cal spk sim
59
+ s3prl
60
+ fire
61
+
62
+ # cal LMD
63
+ dlib
64
+
65
+ # debug
66
+ ipykernel
67
+
68
+ # lama
69
+ hydra-core
70
+ pytorch_lightning
71
+ setproctitle
72
+
73
+ # Gradio GUI
74
+ httpx==0.23.3
75
+ gradio==4.16.0
inference/app_real3dportrait.py ADDED
@@ -0,0 +1,244 @@
1
+ import os, sys
2
+ import argparse
3
+ import gradio as gr
4
+ from inference.real3d_infer import GeneFace2Infer
5
+ from utils.commons.hparams import hparams
6
+
7
+ class Inferer(GeneFace2Infer):
8
+ def infer_once_args(self, *args, **kargs):
9
+ assert len(kargs) == 0
10
+ keys = [
11
+ 'src_image_name',
12
+ 'drv_audio_name',
13
+ 'drv_pose_name',
14
+ 'bg_image_name',
15
+ 'blink_mode',
16
+ 'temperature',
17
+ 'mouth_amp',
18
+ 'out_mode',
19
+ 'map_to_init_pose',
20
+ 'hold_eye_opened',
21
+ 'head_torso_threshold',
22
+ 'a2m_ckpt',
23
+ 'head_ckpt',
24
+ 'torso_ckpt',
25
+ ]
26
+ inp = {}
27
+ out_name = None
28
+ info = ""
29
+
30
+ try: # try to catch errors and jump to return
31
+ for key_index in range(len(keys)):
32
+ key = keys[key_index]
33
+ inp[key] = args[key_index]
34
+ if '_name' in key:
35
+ inp[key] = inp[key] if inp[key] is not None else ''
36
+
37
+ if inp['src_image_name'] == '':
38
+ info = "Input Error: Source image is REQUIRED!"
39
+ raise ValueError
40
+ if inp['drv_audio_name'] == '' and inp['drv_pose_name'] == '':
41
+ info = "Input Error: At least one of driving audio or video is REQUIRED!"
42
+ raise ValueError
43
+
44
+
45
+ if inp['drv_audio_name'] == '' and inp['drv_pose_name'] != '':
46
+ inp['drv_audio_name'] = inp['drv_pose_name']
47
+ print("No audio input; using the driving pose video for video-driven generation")
48
+
49
+ if inp['drv_pose_name'] == '':
50
+ inp['drv_pose_name'] = 'static'
51
+
52
+ reload_flag = False
53
+ if inp['a2m_ckpt'] != self.audio2secc_dir:
54
+ print("Changes of a2m_ckpt detected, reloading model")
55
+ reload_flag = True
56
+ if inp['head_ckpt'] != self.head_model_dir:
57
+ print("Changes of head_ckpt detected, reloading model")
58
+ reload_flag = True
59
+ if inp['torso_ckpt'] != self.torso_model_dir:
60
+ print("Changes of torso_ckpt detected, reloading model")
61
+ reload_flag = True
62
+
63
+ inp['out_name'] = ''
64
+ inp['seed'] = 42
65
+
66
+ print(f"infer inputs : {inp}")
67
+ if self.secc2video_hparams['htbsr_head_threshold'] != inp['head_torso_threshold']:
68
+ print("Changes of head_torso_threshold detected, reloading model")
69
+ reload_flag = True
70
+
71
+ try:
72
+ if reload_flag:
73
+ self.__init__(inp['a2m_ckpt'], inp['head_ckpt'], inp['torso_ckpt'], inp=inp, device=self.device)
74
+ except Exception as e:
75
+ content = f"{e}"
76
+ info = f"Reload ERROR: {content}"
77
+ raise ValueError
78
+ try:
79
+ out_name = self.infer_once(inp)
80
+ except Exception as e:
81
+ content = f"{e}"
82
+ info = f"Inference ERROR: {content}"
83
+ raise ValueError
84
+ except Exception as e:
85
+ if info == "": # unexpected errors
86
+ content = f"{e}"
87
+ info = f"WebUI ERROR: {content}"
88
+
89
+ # output part
90
+ if len(info) > 0 : # there is errors
91
+ print(info)
92
+ info_gr = gr.update(visible=True, value=info)
93
+ else: # no errors
94
+ info_gr = gr.update(visible=False, value=info)
95
+ if out_name is not None and len(out_name) > 0 and os.path.exists(out_name): # good output
96
+ print(f"Successfully generated at {out_name}")
97
+ video_gr = gr.update(visible=True, value=out_name)
98
+ else:
99
+ print(f"Failed to generate")
100
+ video_gr = gr.update(visible=True, value=out_name)
101
+
102
+ return video_gr, info_gr
103
+
104
+ def toggle_audio_file(choice):
105
+ if choice == False:
106
+ return gr.update(visible=True), gr.update(visible=False)
107
+ else:
108
+ return gr.update(visible=False), gr.update(visible=True)
109
+
110
+ def ref_video_fn(path_of_ref_video):
111
+ if path_of_ref_video is not None:
112
+ return gr.update(value=True)
113
+ else:
114
+ return gr.update(value=False)
115
+
116
+ def real3dportrait_demo(
117
+ audio2secc_dir,
118
+ head_model_dir,
119
+ torso_model_dir,
120
+ device = 'cuda',
121
+ warpfn = None,
122
+ ):
123
+
124
+ sep_line = "-" * 40
125
+
126
+ infer_obj = Inferer(
127
+ audio2secc_dir=audio2secc_dir,
128
+ head_model_dir=head_model_dir,
129
+ torso_model_dir=torso_model_dir,
130
+ device=device,
131
+ )
132
+
133
+ print(sep_line)
134
+ print("Model loading is finished.")
135
+ print(sep_line)
136
+ with gr.Blocks(analytics_enabled=False) as real3dportrait_interface:
137
+ gr.Markdown("\
138
+ <div align='center'> <h2> Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis (ICLR 2024 Spotlight) </span> </h2> \
139
+ <a style='font-size:18px;color: #a0a0a0' href='https://arxiv.org/pdf/2401.08503.pdf'>Arxiv</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
140
+ <a style='font-size:18px;color: #a0a0a0' href='https://real3dportrait.github.io/'>Homepage</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
141
+ <a style='font-size:18px;color: #a0a0a0' href='https://baidu.com'> Github </div>")
142
+
143
+ sources = None
144
+ with gr.Row():
145
+ with gr.Column(variant='panel'):
146
+ with gr.Tabs(elem_id="source_image"):
147
+ with gr.TabItem('Upload image'):
148
+ with gr.Row():
149
+ src_image_name = gr.Image(label="Source image (required)", sources=sources, type="filepath", value="data/raw/examples/Macron.png")
150
+ with gr.Tabs(elem_id="driven_audio"):
151
+ with gr.TabItem('Upload audio'):
152
+ with gr.Column(variant='panel'):
153
+ drv_audio_name = gr.Audio(label="Input audio (required for audio-driven)", sources=sources, type="filepath", value="data/raw/examples/Obama_5s.wav")
154
+ with gr.Tabs(elem_id="driven_pose"):
155
+ with gr.TabItem('Upload video'):
156
+ with gr.Column(variant='panel'):
157
+ drv_pose_name = gr.Video(label="Driven Pose (required for video-driven, optional for audio-driven)", sources=sources, value="data/raw/examples/May_5s.mp4")
158
+ with gr.Tabs(elem_id="bg_image"):
159
+ with gr.TabItem('Upload image'):
160
+ with gr.Row():
161
+ bg_image_name = gr.Image(label="Background image (optional)", sources=sources, type="filepath", value="data/raw/examples/bg.png")
162
+
163
+
164
+ with gr.Column(variant='panel'):
165
+ with gr.Tabs(elem_id="checkbox"):
166
+ with gr.TabItem('General Settings'):
167
+ with gr.Column(variant='panel'):
168
+
169
+ blink_mode = gr.Radio(['none', 'period'], value='period', label='blink mode', info="whether to blink periodically")
170
+ temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.025, label="temperature", value=0.2, info='audio to secc temperature',)
171
+ mouth_amp = gr.Slider(minimum=0.0, maximum=1.0, step=0.025, label="mouth amplitude", value=0.45, info='higher -> mouth opens wider; defaults to 0.45',)
172
+ out_mode = gr.Radio(['final', 'concat_debug'], value='final', label='output layout', info="final: only the final output; concat_debug: the final output concatenated with internal features")
173
+ map_to_init_pose = gr.Checkbox(label="Whether to map pose of first frame to initial pose")
174
+ hold_eye_opened = gr.Checkbox(label="Whether to maintain eyes always open")
175
+ head_torso_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.025, label="head torso threshold", value=0.7, info='increase this if you find ghosting around the hair in the output; defaults to 0.7',)
176
+
177
+ submit = gr.Button('Generate', elem_id="generate", variant='primary')
178
+
179
+ with gr.Tabs(elem_id="genearted_video"):
180
+ info_box = gr.Textbox(label="Error", interactive=False, visible=False)
181
+ gen_video = gr.Video(label="Generated video", format="mp4", visible=True)
182
+ with gr.Column(variant='panel'):
183
+ with gr.Tabs(elem_id="checkbox"):
184
+ with gr.TabItem('Checkpoints'):
185
+ with gr.Column(variant='panel'):
186
+ ckpt_info_box = gr.Textbox(value="Please select a \"ckpt\" file under the checkpoint folder", interactive=False, visible=True, show_label=False)
187
+ audio2secc_dir = gr.FileExplorer(glob="checkpoints/**/*.ckpt", value=audio2secc_dir, file_count='single', label='audio2secc model ckpt path or directory')
188
+ head_model_dir = gr.FileExplorer(glob="checkpoints/**/*.ckpt", value=head_model_dir, file_count='single', label='head model ckpt path or directory (will be ignored if torso model is set)')
189
+ torso_model_dir = gr.FileExplorer(glob="checkpoints/**/*.ckpt", value=torso_model_dir, file_count='single', label='torso model ckpt path or directory')
190
+ # audio2secc_dir = gr.Textbox(audio2secc_dir, max_lines=1, label='audio2secc model ckpt path or directory (will be ignored if torso model is set)')
191
+ # head_model_dir = gr.Textbox(head_model_dir, max_lines=1, label='head model ckpt path or directory (will be ignored if torso model is set)')
192
+ # torso_model_dir = gr.Textbox(torso_model_dir, max_lines=1, label='torso model ckpt path or directory')
193
+
194
+
195
+ fn = infer_obj.infer_once_args
196
+ if warpfn:
197
+ fn = warpfn(fn)
198
+ submit.click(
199
+ fn=fn,
200
+ inputs=[
201
+ src_image_name,
202
+ drv_audio_name,
203
+ drv_pose_name,
204
+ bg_image_name,
205
+ blink_mode,
206
+ temperature,
207
+ mouth_amp,
208
+ out_mode,
209
+ map_to_init_pose,
210
+ hold_eye_opened,
211
+ head_torso_threshold,
212
+ audio2secc_dir,
213
+ head_model_dir,
214
+ torso_model_dir,
215
+ ],
216
+ outputs=[
217
+ gen_video,
218
+ info_box,
219
+ ],
220
+ )
221
+
222
+ print(sep_line)
223
+ print("Gradio page is constructed.")
224
+ print(sep_line)
225
+
226
+ return real3dportrait_interface
227
+
228
+ if __name__ == "__main__":
229
+ parser = argparse.ArgumentParser()
230
+ parser.add_argument("--a2m_ckpt", type=str, default='checkpoints/240126_real3dportrait_orig/audio2secc_vae/model_ckpt_steps_400000.ckpt')
231
+ parser.add_argument("--head_ckpt", type=str, default='')
232
+ parser.add_argument("--torso_ckpt", type=str, default='checkpoints/240126_real3dportrait_orig/secc2plane_torso_orig/model_ckpt_steps_100000.ckpt')
233
+ parser.add_argument("--port", type=int, default=None)
234
+ args = parser.parse_args()
235
+ demo = real3dportrait_demo(
236
+ audio2secc_dir=args.a2m_ckpt,
237
+ head_model_dir=args.head_ckpt,
238
+ torso_model_dir=args.torso_ckpt,
239
+ device='cuda:0',
240
+ warpfn=None,
241
+ )
242
+ demo.queue()
243
+ demo.launch(server_port=args.port)
244
+
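+ # Launch example (illustrative port value; the flags correspond to the argparse options above):
+ #   python inference/app_real3dportrait.py --port 7860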
inference/edit_secc.py ADDED
@@ -0,0 +1,147 @@
1
+ import cv2
2
+ import torch
3
+ from utils.commons.image_utils import dilate, erode
4
+ from sklearn.neighbors import NearestNeighbors
5
+ import copy
6
+ import numpy as np
7
+ from utils.commons.meters import Timer
8
+
9
+ def hold_eye_opened_for_secc(img):
10
+ img = img.permute(1,2,0).cpu().numpy()
11
+ img = ((img +1)/2*255).astype(np.uint)
12
+ face_mask = (img[...,0] != 0) & (img[...,1] != 0) & (img[...,2] != 0)
13
+ face_xys = np.stack(np.nonzero(face_mask)).transpose(1, 0) # [N_face, 2] coordinates of face pixels
14
+ h,w = face_mask.shape
15
+ # get face and eye mask
16
+ left_eye_prior_reigon = np.zeros([h,w], dtype=bool)
17
+ right_eye_prior_reigon = np.zeros([h,w], dtype=bool)
18
+ left_eye_prior_reigon[h//4:h//2, w//4:w//2] = True
19
+ right_eye_prior_reigon[h//4:h//2, w//2:w//4*3] = True
20
+ eye_prior_reigon = left_eye_prior_reigon | right_eye_prior_reigon
21
+ coarse_eye_mask = (~ face_mask) & eye_prior_reigon
22
+ coarse_eye_xys = np.stack(np.nonzero(coarse_eye_mask)).transpose(1, 0) # [N_nonbg,2] coordinate of non-face pixels
23
+
24
+ opened_eye_mask = cv2.imread('inference/os_avatar/opened_eye_mask.png')
25
+ opened_eye_mask = torch.nn.functional.interpolate(torch.tensor(opened_eye_mask).permute(2,0,1).unsqueeze(0), size=(img.shape[0], img.shape[1]), mode='nearest')[0].permute(1,2,0).sum(-1).bool().cpu() # [512,512,3]
26
+ coarse_opened_eye_xys = np.stack(np.nonzero(opened_eye_mask)) # [N_nonbg,2] coordinate of non-face pixels
27
+
28
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(coarse_eye_xys)
29
+ dists, _ = nbrs.kneighbors(coarse_opened_eye_xys) # [512*512, 1] distance to nearest non-bg pixel
30
+ # print(dists.max())
31
+ non_opened_eye_pixs = dists > max(dists.max()*0.75, 4) # opened-eye pixels farther than this distance will be closed
32
+ non_opened_eye_pixs = non_opened_eye_pixs.reshape([-1])
33
+ opened_eye_xys_to_erode = coarse_opened_eye_xys[non_opened_eye_pixs]
34
+ opened_eye_mask[opened_eye_xys_to_erode[...,0], opened_eye_xys_to_erode[...,1]] = False # shrink the mask by a few pixels at the face-eye boundary for smoothness
35
+
36
+ img[opened_eye_mask] = 0
37
+ return torch.tensor(img.astype(np.float32) / 127.5 - 1).permute(2,0,1)
38
+
39
+
40
+ # def hold_eye_opened_for_secc(img):
41
+ # img = copy.copy(img)
42
+ # eye_mask = cv2.imread('inference/os_avatar/opened_eye_mask.png')
43
+ # eye_mask = torch.nn.functional.interpolate(torch.tensor(eye_mask).permute(2,0,1).unsqueeze(0), size=(img.shape[-2], img.shape[-1]), mode='nearest')[0].bool().to(img.device) # [3,512,512]
44
+ # img[eye_mask] = -1
45
+ # return img
46
+
47
+ def blink_eye_for_secc(img, close_eye_percent=0.5):
48
+ """
49
+ secc_img: [3,h,w], tensor, -1~1
50
+ """
51
+ img = img.permute(1,2,0).cpu().numpy()
52
+ img = ((img +1)/2*255).astype(np.uint)
53
+ assert close_eye_percent <= 1.0 and close_eye_percent >= 0.
54
+ if close_eye_percent == 0: return torch.tensor(img.astype(np.float32) / 127.5 - 1).permute(2,0,1)
55
+ img = copy.deepcopy(img)
56
+ face_mask = (img[...,0] != 0) & (img[...,1] != 0) & (img[...,2] != 0)
57
+ h,w = face_mask.shape
58
+
59
+ # get face and eye mask
60
+ left_eye_prior_reigon = np.zeros([h,w], dtype=bool)
61
+ right_eye_prior_reigon = np.zeros([h,w], dtype=bool)
62
+ left_eye_prior_reigon[h//4:h//2, w//4:w//2] = True
63
+ right_eye_prior_reigon[h//4:h//2, w//2:w//4*3] = True
64
+ eye_prior_reigon = left_eye_prior_reigon | right_eye_prior_reigon
65
+ coarse_eye_mask = (~ face_mask) & eye_prior_reigon
66
+ coarse_left_eye_mask = (~ face_mask) & left_eye_prior_reigon
67
+ coarse_right_eye_mask = (~ face_mask) & right_eye_prior_reigon
68
+ coarse_eye_xys = np.stack(np.nonzero(coarse_eye_mask)).transpose(1, 0) # [N_nonbg,2] coordinate of non-face pixels
69
+ min_h = coarse_eye_xys[:, 0].min()
70
+ max_h = coarse_eye_xys[:, 0].max()
71
+ coarse_left_eye_xys = np.stack(np.nonzero(coarse_left_eye_mask)).transpose(1, 0) # [N_nonbg,2] coordinate of non-face pixels
72
+ left_min_w = coarse_left_eye_xys[:, 1].min()
73
+ left_max_w = coarse_left_eye_xys[:, 1].max()
74
+ coarse_right_eye_xys = np.stack(np.nonzero(coarse_right_eye_mask)).transpose(1, 0) # [N_nonbg,2] coordinate of non-face pixels
75
+ right_min_w = coarse_right_eye_xys[:, 1].min()
76
+ right_max_w = coarse_right_eye_xys[:, 1].max()
77
+
78
+ # reduce the number of face pixels to consider, to lower the cost of the KNN query
79
+ left_eye_prior_reigon = np.zeros([h,w], dtype=bool)
80
+ more_room = 4 # too small a margin causes artifacts
81
+ left_eye_prior_reigon[min_h-more_room:max_h+more_room, left_min_w-more_room:left_max_w+more_room] = True
82
+ right_eye_prior_reigon = np.zeros([h,w], dtype=bool)
83
+ right_eye_prior_reigon[min_h-more_room:max_h+more_room, right_min_w-more_room:right_max_w+more_room] = True
84
+ eye_prior_reigon = left_eye_prior_reigon | right_eye_prior_reigon
85
+
86
+ around_eye_face_mask = face_mask & eye_prior_reigon
87
+ face_mask = around_eye_face_mask
88
+ face_xys = np.stack(np.nonzero(around_eye_face_mask)).transpose(1, 0) # [N_nonbg,2] coordinate of non-face pixels
89
+
90
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(coarse_eye_xys)
91
+ dists, _ = nbrs.kneighbors(face_xys) # [512*512, 1] distance to nearest non-bg pixel
92
+ face_pixs = dists > 5 # only pixels more than 5 away from the nearest eye pixel count as face; too small a threshold causes artifacts
93
+ face_pixs = face_pixs.reshape([-1])
94
+ face_xys_to_erode = face_xys[~face_pixs]
95
+ face_mask[face_xys_to_erode[...,0], face_xys_to_erode[...,1]] = False # shrink the mask by a few pixels at the face-eye boundary for smoothness
96
+ eye_mask = (~ face_mask) & eye_prior_reigon
97
+
98
+ h_grid = np.mgrid[0:h, 0:w][0]
99
+ eye_num_pixel_along_w_axis = eye_mask.sum(axis=0)
100
+ eye_mask_along_w_axis = eye_num_pixel_along_w_axis != 0
101
+
102
+ tmp_h_grid = h_grid.copy()
103
+ tmp_h_grid[~eye_mask] = 0
104
+ eye_mean_h_coord_along_w_axis = tmp_h_grid.sum(axis=0) / np.clip(eye_num_pixel_along_w_axis, a_min=1, a_max=h)
105
+ tmp_h_grid = h_grid.copy()
106
+ tmp_h_grid[~eye_mask] = 99999
107
+ eye_min_h_coord_along_w_axis = tmp_h_grid.min(axis=0)
108
+ tmp_h_grid = h_grid.copy()
109
+ tmp_h_grid[~eye_mask] = -99999
110
+ eye_max_h_coord_along_w_axis = tmp_h_grid.max(axis=0)
111
+
112
+ eye_low_h_coord_along_w_axis = close_eye_percent * eye_mean_h_coord_along_w_axis + (1-close_eye_percent) * eye_min_h_coord_along_w_axis # upper eye
113
+ eye_high_h_coord_along_w_axis = close_eye_percent * eye_mean_h_coord_along_w_axis + (1-close_eye_percent) * eye_max_h_coord_along_w_axis # lower eye
114
+
115
+ tmp_h_grid = h_grid.copy()
116
+ tmp_h_grid[~eye_mask] = 99999
117
+ upper_eye_blink_mask = tmp_h_grid <= eye_low_h_coord_along_w_axis
118
+ tmp_h_grid = h_grid.copy()
119
+ tmp_h_grid[~eye_mask] = -99999
120
+ lower_eye_blink_mask = tmp_h_grid >= eye_high_h_coord_along_w_axis
121
+ eye_blink_mask = upper_eye_blink_mask | lower_eye_blink_mask
122
+
123
+ face_xys = np.stack(np.nonzero(around_eye_face_mask)).transpose(1, 0) # [N_nonbg,2] coordinate of non-face pixels
124
+ eye_blink_xys = np.stack(np.nonzero(eye_blink_mask)).transpose(1, 0) # [N_nonbg,hw] coordinate of non-face pixels
125
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(face_xys)
126
+ distances, indices = nbrs.kneighbors(eye_blink_xys)
127
+ bg_fg_xys = face_xys[indices[:, 0]]
128
+ img[eye_blink_xys[:, 0], eye_blink_xys[:, 1], :] = img[bg_fg_xys[:, 0], bg_fg_xys[:, 1], :]
129
+ return torch.tensor(img.astype(np.float32) / 127.5 - 1).permute(2,0,1)
130
+
131
+
132
+ if __name__ == '__main__':
133
+ import imageio
134
+ import tqdm
135
+ img = cv2.imread("assets/cano_secc.png")
136
+ img = img / 127.5 - 1
137
+ img = torch.FloatTensor(img).permute(2, 0, 1)
138
+ fps = 25
139
+ writer = imageio.get_writer('demo_blink.mp4', fps=fps)
140
+
141
+ for i in tqdm.trange(33):
142
+ blink_percent = 0.03 * i
143
+ with Timer("Blink", True):
144
+ out_img = blink_eye_for_secc(img, blink_percent)
145
+ out_img = ((out_img.permute(1,2,0)+1)*127.5).int().numpy()
146
+ writer.append_data(out_img)
147
+ writer.close()
inference/infer_utils.py ADDED
@@ -0,0 +1,154 @@
1
+ import os
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import librosa
5
+ import numpy as np
6
+ import importlib
7
+ import tqdm
8
+ import copy
9
+ import cv2
10
+ from scipy.spatial.transform import Rotation
11
+
12
+
13
+ def load_img_to_512_hwc_array(img_name):
14
+ img = cv2.imread(img_name)
15
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
16
+ img = cv2.resize(img, (512, 512))
17
+ return img
18
+
19
+ def load_img_to_normalized_512_bchw_tensor(img_name):
20
+ img = load_img_to_512_hwc_array(img_name)
21
+ img = ((torch.tensor(img) - 127.5)/127.5).float().unsqueeze(0).permute(0, 3, 1,2) # [b,c,h,w]
22
+ return img
23
+
24
+ def mirror_index(index, len_seq):
25
+ """
26
+ get mirror index when indexing a sequence and the index is larger than len_pose
27
+ args:
28
+ index: int
29
+ len_pose: int
30
+ return:
31
+ mirror_index: int
32
+ """
33
+ turn = index // len_seq
34
+ res = index % len_seq
35
+ if turn % 2 == 0:
36
+ return res # forward indexing
37
+ else:
38
+ return len_seq - res - 1 # reverse indexing
39
+
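+ # Worked example (illustrative): with len_seq=5, indices 0..9 map to
+ # 0,1,2,3,4,4,3,2,1,0, i.e. the sequence is traversed forward then backward,
+ # so looping a short driving-pose sequence plays back as a seamless ping-pong.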
40
+ def smooth_camera_sequence(camera, kernel_size=7):
41
+ """
42
+ smooth the camera trajectory (i.e., rotation & translation)...
43
+ args:
44
+ camera: [N, 25] or [N, 16]. np.ndarray
45
+ kernel_size: int
46
+ return:
47
+ smoothed_camera: [N, 25] or [N, 16]. np.ndarray
48
+ """
49
+ # poses: [N, 25], numpy array
50
+ N = camera.shape[0]
51
+ K = kernel_size // 2
52
+ poses = camera[:, :16].reshape([-1, 4, 4]).copy()
53
+ trans = poses[:, :3, 3].copy() # [N, 3]
54
+ rots = poses[:, :3, :3].copy() # [N, 3, 3]
55
+
56
+ for i in range(N):
57
+ start = max(0, i - K)
58
+ end = min(N, i + K + 1)
59
+ poses[i, :3, 3] = trans[start:end].mean(0)
60
+ try:
61
+ poses[i, :3, :3] = Rotation.from_matrix(rots[start:end]).mean().as_matrix()
62
+ except:
63
+ if i == 0:
64
+ poses[i, :3, :3] = rots[i]
65
+ else:
66
+ poses[i, :3, :3] = poses[i-1, :3, :3]
67
+ poses = poses.reshape([-1, 16])
68
+ camera[:, :16] = poses
69
+ return camera
70
+
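+ # Usage sketch (mirrors the call in inference/real3d_infer.py): cameras are [T, 25]
+ # arrays of a flattened 4x4 extrinsic plus a 3x3 intrinsic; translations are smoothed
+ # with a plain moving average and rotations with scipy's Rotation.mean(), e.g.
+ #   camera = np.concatenate([c2w.reshape([-1, 16]), intrinsics.reshape([-1, 9])], axis=-1)
+ #   camera = smooth_camera_sequence(camera, kernel_size=7)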
71
+ def smooth_features_xd(in_tensor, kernel_size=7):
72
+ """
73
+ smooth the feature maps
74
+ args:
75
+ in_tensor: [T, c,h,w] or [T, c1,c2,h,w]
76
+ kernel_size: int
77
+ return:
78
+ out_tensor: [T, c,h,w] or [T, c1,c2,h,w]
79
+ """
80
+ t = in_tensor.shape[0]
81
+ ndim = in_tensor.ndim
82
+ pad = (kernel_size- 1)//2
83
+ in_tensor = torch.cat([torch.flip(in_tensor[0:pad], dims=[0]), in_tensor, torch.flip(in_tensor[t-pad:t], dims=[0])], dim=0)
84
+ if ndim == 2: # tc
85
+ _,c = in_tensor.shape
86
+ in_tensor = in_tensor.permute(1,0).reshape([-1,1,t+2*pad]) # [c, 1, t]
87
+ elif ndim == 4: # tchw
88
+ _,c,h,w = in_tensor.shape
89
+ in_tensor = in_tensor.permute(1,2,3,0).reshape([-1,1,t+2*pad]) # [c, 1, t]
90
+ elif ndim == 5: # tcchw, like deformation
91
+ _,c1,c2, h,w = in_tensor.shape
92
+ in_tensor = in_tensor.permute(1,2,3,4,0).reshape([-1,1,t+2*pad]) # [c, 1, t]
93
+ else: raise NotImplementedError()
94
+ avg_kernel = 1 / kernel_size * torch.Tensor([1.]*kernel_size).reshape([1,1,kernel_size]).float().to(in_tensor.device) # [1, 1, kw]
95
+ out_tensor = F.conv1d(in_tensor, avg_kernel)
96
+ if ndim == 2: # tc
97
+ return out_tensor.reshape([c,t]).permute(1,0)
98
+ elif ndim == 4: # tchw
99
+ return out_tensor.reshape([c,h,w,t]).permute(3,0,1,2)
100
+ elif ndim == 5: # tcchw, like deformation
101
+ return out_tensor.reshape([c1,c2,h,w,t]).permute(4,0,1,2,3)
102
+
103
+
104
+ def extract_audio_motion_from_ref_video(video_name):
105
+ def save_wav16k(audio_name):
106
+ supported_types = ('.wav', '.mp3', '.mp4', '.avi')
107
+ assert audio_name.endswith(supported_types), f"Now we only support {','.join(supported_types)} as audio source!"
108
+ wav16k_name = audio_name[:-4] + '_16k.wav'
109
+ extract_wav_cmd = f"ffmpeg -i {audio_name} -f wav -ar 16000 -v quiet -y {wav16k_name} -y"
110
+ os.system(extract_wav_cmd)
111
+ print(f"Extracted wav file (16khz) from {audio_name} to {wav16k_name}.")
112
+ return wav16k_name
113
+
114
+ def get_f0( wav16k_name):
115
+ from data_gen.process_lrs3.process_audio_mel_f0 import extract_mel_from_fname,extract_f0_from_wav_and_mel
116
+ wav, mel = extract_mel_from_fname(wav16k_name)
117
+ f0, f0_coarse = extract_f0_from_wav_and_mel(wav, mel)
118
+ f0 = f0.reshape([-1,1])
119
+ f0 = torch.tensor(f0)
120
+ return f0
121
+
122
+ def get_hubert(wav16k_name):
123
+ from data_gen.utils.process_audio.extract_hubert import get_hubert_from_16k_wav
124
+ hubert = get_hubert_from_16k_wav(wav16k_name).detach().numpy()
125
+ len_mel = hubert.shape[0]
126
+ x_multiply = 8
127
+ if len_mel % x_multiply == 0:
128
+ num_to_pad = 0
129
+ else:
130
+ num_to_pad = x_multiply - len_mel % x_multiply
131
+ hubert = np.pad(hubert, pad_width=((0,num_to_pad), (0,0)))
132
+ hubert = torch.tensor(hubert)
133
+ return hubert
134
+
135
+ def get_exp(video_name):
136
+ from data_gen.utils.process_video.fit_3dmm_landmark import fit_3dmm_for_a_video
137
+ drv_motion_coeff_dict = fit_3dmm_for_a_video(video_name, save=False)
138
+ exp = torch.tensor(drv_motion_coeff_dict['exp'])
139
+ return exp
140
+
141
+ wav16k_name = save_wav16k(video_name)
142
+ f0 = get_f0(wav16k_name)
143
+ hubert = get_hubert(wav16k_name)
144
+ os.system(f"rm {wav16k_name}")
145
+ exp = get_exp(video_name)
146
+ target_length = min(len(exp), len(hubert)//2, len(f0)//2)
147
+ exp = exp[:target_length]
148
+ f0 = f0[:target_length*2]
149
+ hubert = hubert[:target_length*2]
150
+ return exp.unsqueeze(0), hubert.unsqueeze(0), f0.unsqueeze(0)
151
+
152
+
153
+ if __name__ == '__main__':
154
+ extract_audio_motion_from_ref_video('data/raw/videos/crop_0213.mp4')
inference/real3d_infer.py ADDED
@@ -0,0 +1,542 @@
1
+ import os
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import torchshow as ts
5
+ import librosa
6
+ import random
7
+ import time
8
+ import numpy as np
9
+ import importlib
10
+ import tqdm
11
+ import copy
12
+ import cv2
13
+
14
+ # common utils
15
+ from utils.commons.hparams import hparams, set_hparams
16
+ from utils.commons.tensor_utils import move_to_cuda, convert_to_tensor
17
+ from utils.commons.ckpt_utils import load_ckpt, get_last_checkpoint
18
+ # 3DMM-related utils
19
+ from deep_3drecon.deep_3drecon_models.bfm import ParametricFaceModel
20
+ from data_util.face3d_helper import Face3DHelper
21
+ from data_gen.utils.process_image.fit_3dmm_landmark import fit_3dmm_for_a_image
22
+ from data_gen.utils.process_video.fit_3dmm_landmark import fit_3dmm_for_a_video
23
+ from deep_3drecon.secc_renderer import SECC_Renderer
24
+ from data_gen.eg3d.convert_to_eg3d_convention import get_eg3d_convention_camera_pose_intrinsic
25
+ # Face Parsing
26
+ from data_gen.utils.mp_feature_extractors.mp_segmenter import MediapipeSegmenter
27
+ from data_gen.utils.process_video.extract_segment_imgs import inpaint_torso_job, extract_background
28
+ # other inference utils
29
+ from inference.infer_utils import mirror_index, load_img_to_512_hwc_array, load_img_to_normalized_512_bchw_tensor
30
+ from inference.infer_utils import smooth_camera_sequence, smooth_features_xd
31
+ from inference.edit_secc import blink_eye_for_secc
32
+
33
+
34
+ def read_first_frame_from_a_video(vid_name):
35
+ frames = []
36
+ cap = cv2.VideoCapture(vid_name)
37
+ ret, frame_bgr = cap.read()
38
+ frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
39
+ return frame_rgb
40
+
41
+ def analyze_weights_img(gen_output):
42
+ img_raw = gen_output['image_raw']
43
+ mask_005_to_03 = torch.bitwise_and(gen_output['weights_img']>0.05, gen_output['weights_img']<0.3).repeat([1,3,1,1])
44
+ mask_005_to_05 = torch.bitwise_and(gen_output['weights_img']>0.05, gen_output['weights_img']<0.5).repeat([1,3,1,1])
45
+ mask_005_to_07 = torch.bitwise_and(gen_output['weights_img']>0.05, gen_output['weights_img']<0.7).repeat([1,3,1,1])
46
+ mask_005_to_09 = torch.bitwise_and(gen_output['weights_img']>0.05, gen_output['weights_img']<0.9).repeat([1,3,1,1])
47
+ mask_005_to_10 = torch.bitwise_and(gen_output['weights_img']>0.05, gen_output['weights_img']<1.0).repeat([1,3,1,1])
48
+
49
+ img_raw_005_to_03 = img_raw.clone()
50
+ img_raw_005_to_03[~mask_005_to_03] = -1
51
+ img_raw_005_to_05 = img_raw.clone()
52
+ img_raw_005_to_05[~mask_005_to_05] = -1
53
+ img_raw_005_to_07 = img_raw.clone()
54
+ img_raw_005_to_07[~mask_005_to_07] = -1
55
+ img_raw_005_to_09 = img_raw.clone()
56
+ img_raw_005_to_09[~mask_005_to_09] = -1
57
+ img_raw_005_to_10 = img_raw.clone()
58
+ img_raw_005_to_10[~mask_005_to_10] = -1
59
+ ts.save([img_raw_005_to_03[0], img_raw_005_to_05[0], img_raw_005_to_07[0], img_raw_005_to_09[0], img_raw_005_to_10[0]])
60
+
61
+ class GeneFace2Infer:
62
+ def __init__(self, audio2secc_dir, head_model_dir, torso_model_dir, device=None, inp=None):
63
+ if device is None:
64
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
65
+ self.device = device
66
+ self.audio2secc_model = self.load_audio2secc(audio2secc_dir)
67
+ self.secc2video_model = self.load_secc2video(head_model_dir, torso_model_dir, inp)
68
+ self.audio2secc_model.to(device).eval()
69
+ self.secc2video_model.to(device).eval()
70
+ self.seg_model = MediapipeSegmenter()
71
+ self.secc_renderer = SECC_Renderer(512)
72
+ self.face3d_helper = Face3DHelper(use_gpu=True, keypoint_mode='lm68')
73
+ self.mp_face3d_helper = Face3DHelper(use_gpu=True, keypoint_mode='mediapipe')
74
+
75
+ def load_audio2secc(self, audio2secc_dir):
76
+ config_name = f"{audio2secc_dir}/config.yaml" if not audio2secc_dir.endswith(".ckpt") else f"{os.path.dirname(audio2secc_dir)}/config.yaml"
77
+ set_hparams(f"{config_name}", print_hparams=False)
78
+ self.audio2secc_dir = audio2secc_dir
79
+ self.audio2secc_hparams = copy.deepcopy(hparams)
80
+ from modules.audio2motion.vae import VAEModel, PitchContourVAEModel
81
+ if self.audio2secc_hparams['audio_type'] == 'hubert':
82
+ audio_in_dim = 1024
83
+ elif self.audio2secc_hparams['audio_type'] == 'mfcc':
84
+ audio_in_dim = 13
85
+
86
+ if 'icl' in hparams['task_cls']:
87
+ self.use_icl_audio2motion = True
88
+ model = InContextAudio2MotionModel(hparams['icl_model_type'], hparams=self.audio2secc_hparams)
89
+ else:
90
+ self.use_icl_audio2motion = False
91
+ if hparams.get("use_pitch", False) is True:
92
+ model = PitchContourVAEModel(hparams, in_out_dim=64, audio_in_dim=audio_in_dim)
93
+ else:
94
+ model = VAEModel(in_out_dim=64, audio_in_dim=audio_in_dim)
95
+ load_ckpt(model, f"{audio2secc_dir}", model_name='model', strict=True)
96
+ return model
97
+
98
+ def load_secc2video(self, head_model_dir, torso_model_dir, inp):
99
+ if inp is None:
100
+ inp = {}
101
+ self.head_model_dir = head_model_dir
102
+ self.torso_model_dir = torso_model_dir
103
+ if torso_model_dir != '':
104
+ if torso_model_dir.endswith(".ckpt"):
105
+ set_hparams(f"{os.path.dirname(torso_model_dir)}/config.yaml", print_hparams=False)
106
+ else:
107
+ set_hparams(f"{torso_model_dir}/config.yaml", print_hparams=False)
108
+ if inp.get('head_torso_threshold', None) is not None:
109
+ hparams['htbsr_head_threshold'] = inp['head_torso_threshold']
110
+ self.secc2video_hparams = copy.deepcopy(hparams)
111
+ from modules.real3d.secc_img2plane_torso import OSAvatarSECC_Img2plane_Torso
112
+ model = OSAvatarSECC_Img2plane_Torso()
113
+ load_ckpt(model, f"{torso_model_dir}", model_name='model', strict=False)
114
+ if head_model_dir != '':
115
+ print("| Warning: --torso_ckpt already contains a head model, so the assigned --head_ckpt will be ignored.")
116
+ else:
117
+ from modules.real3d.secc_img2plane_torso import OSAvatarSECC_Img2plane
118
+ if head_model_dir.endswith(".ckpt"):
119
+ set_hparams(f"{os.path.dirname(head_model_dir)}/config.yaml", print_hparams=False)
120
+ else:
121
+ set_hparams(f"{head_model_dir}/config.yaml", print_hparams=False)
122
+ if inp.get('head_torso_threshold', None) is not None:
123
+ hparams['htbsr_head_threshold'] = inp['head_torso_threshold']
124
+ self.secc2video_hparams = copy.deepcopy(hparams)
125
+ model = OSAvatarSECC_Img2plane()
126
+ load_ckpt(model, f"{head_model_dir}", model_name='model', strict=False)
127
+ return model
128
+
129
+ def infer_once(self, inp):
130
+ self.inp = inp
131
+ samples = self.prepare_batch_from_inp(inp)
132
+ seed = inp['seed'] if inp['seed'] is not None else int(time.time())
133
+ random.seed(seed)
134
+ torch.manual_seed(seed)
135
+ np.random.seed(seed)
136
+ out_name = self.forward_system(samples, inp)
137
+ return out_name
138
+
139
+ def prepare_batch_from_inp(self, inp):
140
+ """
141
+ :param inp: {'audio_source_name': (str)}
142
+ :return: a dict that contains the condition feature of NeRF
143
+ """
144
+ sample = {}
145
+ # Process Driving Motion
146
+ if inp['drv_audio_name'][-4:] in ['.wav', '.mp3']:
147
+ self.save_wav16k(inp['drv_audio_name'])
148
+ if self.audio2secc_hparams['audio_type'] == 'hubert':
149
+ hubert = self.get_hubert(self.wav16k_name)
150
+ elif self.audio2secc_hparams['audio_type'] == 'mfcc':
151
+ hubert = self.get_mfcc(self.wav16k_name) / 100
152
+
153
+ f0 = self.get_f0(self.wav16k_name)
154
+ if f0.shape[0] > len(hubert):
155
+ f0 = f0[:len(hubert)]
156
+ else:
157
+ num_to_pad = len(hubert) - len(f0)
158
+ f0 = np.pad(f0, pad_width=((0,num_to_pad), (0,0)))
159
+ t_x = hubert.shape[0]
160
+ x_mask = torch.ones([1, t_x]).float() # mask for audio frames
161
+ y_mask = torch.ones([1, t_x//2]).float() # mask for motion/image frames
162
+ sample.update({
163
+ 'hubert': torch.from_numpy(hubert).float().unsqueeze(0).cuda(),
164
+ 'f0': torch.from_numpy(f0).float().reshape([1,-1]).cuda(),
165
+ 'x_mask': x_mask.cuda(),
166
+ 'y_mask': y_mask.cuda(),
167
+ })
168
+ sample['blink'] = torch.zeros([1, t_x, 1]).long().cuda()
169
+ sample['audio'] = sample['hubert']
170
+ sample['eye_amp'] = torch.ones([1, 1]).cuda() * 1.0
171
+ sample['mouth_amp'] = torch.ones([1, 1]).cuda() * inp['mouth_amp']
172
+ elif inp['drv_audio_name'][-4:] in ['.mp4']:
173
+ drv_motion_coeff_dict = fit_3dmm_for_a_video(inp['drv_audio_name'], save=False)
174
+ drv_motion_coeff_dict = convert_to_tensor(drv_motion_coeff_dict)
175
+ t_x = drv_motion_coeff_dict['exp'].shape[0] * 2
176
+ self.drv_motion_coeff_dict = drv_motion_coeff_dict
177
+ elif inp['drv_audio_name'][-4:] in ['.npy']:
178
+ drv_motion_coeff_dict = np.load(inp['drv_audio_name'], allow_pickle=True).tolist()
179
+ drv_motion_coeff_dict = convert_to_tensor(drv_motion_coeff_dict)
180
+ t_x = drv_motion_coeff_dict['exp'].shape[0] * 2
181
+ self.drv_motion_coeff_dict = drv_motion_coeff_dict
182
+
183
+ # Face Parsing
184
+ image_name = inp['src_image_name']
185
+ if image_name.endswith(".mp4"):
186
+ img = read_first_frame_from_a_video(image_name)
187
+ image_name = inp['src_image_name'] = image_name[:-4] + '.png'
188
+ cv2.imwrite(image_name, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
189
+ sample['ref_gt_img'] = load_img_to_normalized_512_bchw_tensor(image_name).cuda()
190
+ img = load_img_to_512_hwc_array(image_name)
191
+ segmap = self.seg_model._cal_seg_map(img)
192
+ sample['segmap'] = torch.tensor(segmap).float().unsqueeze(0).cuda()
193
+ head_img = self.seg_model._seg_out_img_with_segmap(img, segmap, mode='head')[0]
194
+ sample['ref_head_img'] = ((torch.tensor(head_img) - 127.5)/127.5).float().unsqueeze(0).permute(0, 3, 1,2).cuda() # [b,c,h,w]
195
+ inpaint_torso_img, _, _, _ = inpaint_torso_job(img, segmap)
196
+ sample['ref_torso_img'] = ((torch.tensor(inpaint_torso_img) - 127.5)/127.5).float().unsqueeze(0).permute(0, 3, 1,2).cuda() # [b,c,h,w]
197
+
198
+ if inp['bg_image_name'] == '':
199
+ bg_img = extract_background([img], [segmap], 'knn')
200
+ else:
201
+ bg_img = cv2.imread(inp['bg_image_name'])
202
+ bg_img = cv2.cvtColor(bg_img, cv2.COLOR_BGR2RGB)
203
+ bg_img = cv2.resize(bg_img, (512,512))
204
+ sample['bg_img'] = ((torch.tensor(bg_img) - 127.5)/127.5).float().unsqueeze(0).permute(0, 3, 1,2).cuda() # [b,c,h,w]
205
+
206
+ # 3DMM, get identity code and camera pose
207
+ coeff_dict = fit_3dmm_for_a_image(image_name, save=False)
208
+ assert coeff_dict is not None
209
+ src_id = torch.tensor(coeff_dict['id']).reshape([1,80]).cuda()
210
+ src_exp = torch.tensor(coeff_dict['exp']).reshape([1,64]).cuda()
211
+ src_euler = torch.tensor(coeff_dict['euler']).reshape([1,3]).cuda()
212
+ src_trans = torch.tensor(coeff_dict['trans']).reshape([1,3]).cuda()
213
+ sample['id'] = src_id.repeat([t_x//2,1])
214
+
215
+ # get the src_kp for torso model
216
+ src_kp = self.face3d_helper.reconstruct_lm2d(src_id, src_exp, src_euler, src_trans) # [1, 68, 2]
217
+ src_kp = (src_kp-0.5) / 0.5 # rescale to -1~1
218
+ sample['src_kp'] = torch.clamp(src_kp, -1, 1).repeat([t_x//2,1,1])
219
+
220
+ # get camera pose file
221
+ # random.seed(time.time())
222
+ inp['drv_pose_name'] = inp['drv_pose_name']
223
+ print(f"| To extract pose from {inp['drv_pose_name']}")
224
+
225
+ # extract camera pose
226
+ if inp['drv_pose_name'] == 'static':
227
+ sample['euler'] = torch.tensor(coeff_dict['euler']).reshape([1,3]).cuda().repeat([t_x//2,1]) # default static pose
228
+ sample['trans'] = torch.tensor(coeff_dict['trans']).reshape([1,3]).cuda().repeat([t_x//2,1])
229
+ else: # from file
230
+ if inp['drv_pose_name'].endswith('.mp4'):
231
+ # extract coeff from video
232
+ drv_pose_coeff_dict = fit_3dmm_for_a_video(inp['drv_pose_name'], save=False)
233
+ else:
234
+ # load from npy
235
+ drv_pose_coeff_dict = np.load(inp['drv_pose_name'], allow_pickle=True).tolist()
236
+ print(f"| Extracted pose from {inp['drv_pose_name']}")
237
+ eulers = convert_to_tensor(drv_pose_coeff_dict['euler']).reshape([-1,3]).cuda()
238
+ trans = convert_to_tensor(drv_pose_coeff_dict['trans']).reshape([-1,3]).cuda()
239
+ len_pose = len(eulers)
240
+ index_lst = [mirror_index(i, len_pose) for i in range(t_x//2)]
241
+ sample['euler'] = eulers[index_lst]
242
+ sample['trans'] = trans[index_lst]
243
+
244
+ # fix the z axis
245
+ sample['trans'][:, -1] = sample['trans'][0:1, -1].repeat([sample['trans'].shape[0]])
246
+
247
+ # mapping to the init pose
248
+ if inp.get("map_to_init_pose", 'False') == 'True':
249
+ diff_euler = torch.tensor(coeff_dict['euler']).reshape([1,3]).cuda() - sample['euler'][0:1]
250
+ sample['euler'] = sample['euler'] + diff_euler
251
+ diff_trans = torch.tensor(coeff_dict['trans']).reshape([1,3]).cuda() - sample['trans'][0:1]
252
+ sample['trans'] = sample['trans'] + diff_trans
253
+
254
+ # prepare camera
255
+ camera_ret = get_eg3d_convention_camera_pose_intrinsic({'euler':sample['euler'].cpu(), 'trans':sample['trans'].cpu()})
256
+ c2w, intrinsics = camera_ret['c2w'], camera_ret['intrinsics']
257
+ # smooth camera
258
+ camera_smo_ksize = 7
259
+ camera = np.concatenate([c2w.reshape([-1,16]), intrinsics.reshape([-1,9])], axis=-1)
260
+ camera = smooth_camera_sequence(camera, kernel_size=camera_smo_ksize) # [T, 25]
261
+ camera = torch.tensor(camera).cuda().float()
262
+ sample['camera'] = camera
263
+
264
+ return sample
265
+
266
+ @torch.no_grad()
267
+ def get_hubert(self, wav16k_name):
268
+ from data_gen.utils.process_audio.extract_hubert import get_hubert_from_16k_wav
269
+ hubert = get_hubert_from_16k_wav(wav16k_name).detach().numpy()
270
+ len_mel = hubert.shape[0]
271
+ x_multiply = 8
272
+ if len_mel % x_multiply == 0:
273
+ num_to_pad = 0
274
+ else:
275
+ num_to_pad = x_multiply - len_mel % x_multiply
276
+ hubert = np.pad(hubert, pad_width=((0,num_to_pad), (0,0)))
277
+ return hubert
278
+
279
+ def get_mfcc(self, wav16k_name):
280
+ from utils.audio import librosa_wav2mfcc
281
+ hparams['fft_size'] = 1200
282
+ hparams['win_size'] = 1200
283
+ hparams['hop_size'] = 480
284
+ hparams['audio_num_mel_bins'] = 80
285
+ hparams['fmin'] = 80
286
+ hparams['fmax'] = 12000
287
+ hparams['audio_sample_rate'] = 24000
288
+ mfcc = librosa_wav2mfcc(wav16k_name,
289
+ fft_size=hparams['fft_size'],
290
+ hop_size=hparams['hop_size'],
291
+ win_length=hparams['win_size'],
292
+ num_mels=hparams['audio_num_mel_bins'],
293
+ fmin=hparams['fmin'],
294
+ fmax=hparams['fmax'],
295
+ sample_rate=hparams['audio_sample_rate'],
296
+ center=True)
297
+ mfcc = np.array(mfcc).reshape([-1, 13])
298
+ len_mel = mfcc.shape[0]
299
+ x_multiply = 8
300
+ if len_mel % x_multiply == 0:
301
+ num_to_pad = 0
302
+ else:
303
+ num_to_pad = x_multiply - len_mel % x_multiply
304
+ mfcc = np.pad(mfcc, pad_width=((0,num_to_pad), (0,0)))
305
+ return mfcc
306
+
307
+ @torch.no_grad()
308
+ def forward_audio2secc(self, batch, inp=None):
309
+ if inp['drv_audio_name'][-4:] in ['.wav', '.mp3']:
310
+ # audio-to-exp
311
+ ret = {}
312
+ pred = self.audio2secc_model.forward(batch, ret=ret,train=False, temperature=inp['temperature'],)
313
+ print("| audio-to-motion finished")
314
+ if pred.shape[-1] == 144:
315
+ id = ret['pred'][0][:,:80]
316
+ exp = ret['pred'][0][:,80:]
317
+ else:
318
+ id = batch['id']
319
+ exp = ret['pred'][0]
320
+ if len(id) < len(exp): # happens when use ICL
321
+ id = torch.cat([id, id[0].unsqueeze(0).repeat([len(exp)-len(id),1])])
322
+ batch['id'] = id
323
+ batch['exp'] = exp
324
+ else:
325
+ drv_motion_coeff_dict = self.drv_motion_coeff_dict
326
+ batch['exp'] = torch.FloatTensor(drv_motion_coeff_dict['exp']).cuda()
327
+
328
+ batch = self.get_driving_motion(batch['id'], batch['exp'], batch['euler'], batch['trans'], batch, inp)
329
+ if self.use_icl_audio2motion:
330
+ self.audio2secc_model.empty_context()
331
+ return batch
332
+
333
+ @torch.no_grad()
334
+ def get_driving_motion(self, id, exp, euler, trans, batch, inp):
335
+ zero_eulers = torch.zeros([id.shape[0], 3]).to(id.device)
336
+ zero_trans = torch.zeros([id.shape[0], 3]).to(exp.device)
337
+ # render the secc given the id,exp
338
+ with torch.no_grad():
339
+ chunk_size = 50
340
+ drv_secc_color_lst = []
341
+ num_iters = len(id)//chunk_size if len(id)%chunk_size == 0 else len(id)//chunk_size+1
342
+ for i in tqdm.trange(num_iters, desc="rendering drv secc"):
343
+ torch.cuda.empty_cache()
344
+ face_mask, drv_secc_color = self.secc_renderer(id[i*chunk_size:(i+1)*chunk_size], exp[i*chunk_size:(i+1)*chunk_size], zero_eulers[i*chunk_size:(i+1)*chunk_size], zero_trans[i*chunk_size:(i+1)*chunk_size])
345
+ drv_secc_color_lst.append(drv_secc_color.cpu())
346
+ drv_secc_colors = torch.cat(drv_secc_color_lst, dim=0)
347
+ _, src_secc_color = self.secc_renderer(id[0:1], exp[0:1], zero_eulers[0:1], zero_trans[0:1])
348
+ _, cano_secc_color = self.secc_renderer(id[0:1], exp[0:1]*0, zero_eulers[0:1], zero_trans[0:1])
349
+ batch['drv_secc'] = drv_secc_colors.cuda()
350
+ batch['src_secc'] = src_secc_color.cuda()
351
+ batch['cano_secc'] = cano_secc_color.cuda()
352
+
353
+ # blinking secc
354
+ if inp['blink_mode'] == 'period':
355
+ period = 5 # second
356
+
357
+ for i in tqdm.trange(len(drv_secc_colors),desc="blinking secc"):
358
+ if i % (25*period) == 0:
359
+ blink_dur_frames = random.randint(8, 12)
360
+ for offset in range(blink_dur_frames):
361
+ j = offset + i
362
+ if j >= len(drv_secc_colors)-1: break
363
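+ # The blink curve below is a downward parabola: it is 0 at t=0 and t=T and peaks
+ # at 1.0 when t=T/2, so the eye closes and re-opens smoothly over the blink duration.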
+ def blink_percent_fn(t, T):
364
+ return -4/T**2 * t**2 + 4/T * t
365
+ blink_percent = blink_percent_fn(offset, blink_dur_frames)
366
+ secc = batch['drv_secc'][j]
367
+ out_secc = blink_eye_for_secc(secc, blink_percent)
368
+ out_secc = out_secc.cuda()
369
+ batch['drv_secc'][j] = out_secc
370
+
371
+ # get the drv_kp for torso model, using the transformed trajectory
372
+ drv_kp = self.face3d_helper.reconstruct_lm2d(id, exp, euler, trans) # [T, 68, 2]
373
+
374
+ drv_kp = (drv_kp-0.5) / 0.5 # rescale to -1~1
375
+ batch['drv_kp'] = torch.clamp(drv_kp, -1, 1)
376
+ return batch
377
+
378
+ @torch.no_grad()
379
+ def forward_secc2video(self, batch, inp=None):
380
+ num_frames = len(batch['drv_secc'])
381
+ camera = batch['camera']
382
+ src_kps = batch['src_kp']
383
+ drv_kps = batch['drv_kp']
384
+ cano_secc_color = batch['cano_secc']
385
+ src_secc_color = batch['src_secc']
386
+ drv_secc_colors = batch['drv_secc']
387
+ ref_img_gt = batch['ref_gt_img']
388
+ ref_img_head = batch['ref_head_img']
389
+ ref_torso_img = batch['ref_torso_img']
390
+ bg_img = batch['bg_img']
391
+ segmap = batch['segmap']
392
+
393
+ # smooth torso drv_kp
394
+ torso_smo_ksize = 7
395
+ drv_kps = smooth_features_xd(drv_kps.reshape([-1, 68*2]), kernel_size=torso_smo_ksize).reshape([-1, 68, 2])
396
+
397
+ # forward renderer
398
+ img_raw_lst = []
399
+ img_lst = []
400
+ depth_img_lst = []
401
+ with torch.no_grad():
402
+ for i in tqdm.trange(num_frames, desc="Real3D-Portrait is rendering frames"):
403
+ kp_src = torch.cat([src_kps[i:i+1].reshape([1, 68, 2]), torch.zeros([1, 68,1]).to(src_kps.device)],dim=-1)
404
+ kp_drv = torch.cat([drv_kps[i:i+1].reshape([1, 68, 2]), torch.zeros([1, 68,1]).to(drv_kps.device)],dim=-1)
405
+ cond={'cond_cano': cano_secc_color,'cond_src': src_secc_color, 'cond_tgt': drv_secc_colors[i:i+1].cuda(),
406
+ 'ref_torso_img': ref_torso_img, 'bg_img': bg_img, 'segmap': segmap,
407
+ 'kp_s': kp_src, 'kp_d': kp_drv}
408
+ if i == 0:
409
+ gen_output = self.secc2video_model.forward(img=ref_img_head, camera=camera[i:i+1], cond=cond, ret={}, cache_backbone=True, use_cached_backbone=False)
410
+ else:
411
+ gen_output = self.secc2video_model.forward(img=ref_img_head, camera=camera[i:i+1], cond=cond, ret={}, cache_backbone=False, use_cached_backbone=True)
412
+ img_lst.append(gen_output['image'])
413
+ img_raw_lst.append(gen_output['image_raw'])
414
+ depth_img_lst.append(gen_output['image_depth'])
415
+
416
+ # save demo video
417
+ depth_imgs = torch.cat(depth_img_lst)
418
+ imgs = torch.cat(img_lst)
419
+ imgs_raw = torch.cat(img_raw_lst)
420
+ secc_img = torch.cat([torch.nn.functional.interpolate(drv_secc_colors[i:i+1], (512,512)) for i in range(num_frames)])
421
+
422
+ if inp['out_mode'] == 'concat_debug':
423
+ secc_img = secc_img.cpu()
424
+ secc_img = ((secc_img + 1) * 127.5).permute(0, 2, 3, 1).int().numpy()
425
+
426
+ depth_img = F.interpolate(depth_imgs, (512,512)).cpu()
427
+ depth_img = depth_img.repeat([1,3,1,1])
428
+ depth_img = (depth_img - depth_img.min()) / (depth_img.max() - depth_img.min())
429
+ depth_img = depth_img * 2 - 1
430
+ depth_img = depth_img.clamp(-1,1)
431
+
432
+ secc_img = secc_img / 127.5 - 1
433
+ secc_img = torch.from_numpy(secc_img).permute(0, 3, 1, 2)
434
+ imgs = torch.cat([ref_img_gt.repeat([imgs.shape[0],1,1,1]).cpu(), secc_img, F.interpolate(imgs_raw, (512,512)).cpu(), depth_img, imgs.cpu()], dim=-1)
435
+ elif inp['out_mode'] == 'final':
436
+ imgs = imgs.cpu()
437
+ elif inp['out_mode'] == 'debug':
438
+ raise NotImplementedError("to do: save separate videos")
439
+ imgs = imgs.clamp(-1,1)
440
+
441
+ import imageio
442
+ debug_name = 'demo.mp4'
443
+ out_imgs = ((imgs.permute(0, 2, 3, 1) + 1)/2 * 255).int().cpu().numpy().astype(np.uint8)
444
+ writer = imageio.get_writer(debug_name, fps=25, format='FFMPEG', codec='h264')
445
+
446
+ for i in tqdm.trange(len(out_imgs), desc="Imageio is saving video"):
447
+ writer.append_data(out_imgs[i])
448
+ writer.close()
449
+
450
+ out_fname = 'infer_out/tmp/' + os.path.basename(inp['src_image_name'])[:-4] + '_' + os.path.basename(inp['drv_pose_name'])[:-4] + '.mp4' if inp['out_name'] == '' else inp['out_name']
451
+ try:
452
+ os.makedirs(os.path.dirname(out_fname), exist_ok=True)
453
+ except: pass
454
+ if inp['drv_audio_name'][-4:] in ['.wav', '.mp3']:
455
+ os.system(f"ffmpeg -i {debug_name} -i {self.wav16k_name} -y -v quiet -shortest {out_fname}")
456
+ os.system(f"rm {debug_name}")
457
+ os.system(f"rm {self.wav16k_name}")
458
+ else:
459
+ ret = os.system(f"ffmpeg -i {debug_name} -i {inp['drv_audio_name']} -map 0:v -map 1:a -y -v quiet -shortest {out_fname}")
460
+ if ret != 0: # failed to extract audio from drv_audio_name, so output the video without an audio track
461
+ os.system(f"mv {debug_name} {out_fname}")
462
+ print(f"Saved at {out_fname}")
463
+ return out_fname
464
+
465
+ @torch.no_grad()
466
+ def forward_system(self, batch, inp):
467
+ self.forward_audio2secc(batch, inp)
468
+ out_fname = self.forward_secc2video(batch, inp)
469
+ return out_fname
470
+
471
+ @classmethod
472
+ def example_run(cls, inp=None):
473
+ inp_tmp = {
474
+ 'drv_audio_name': 'data/raw/val_wavs/zozo.wav',
475
+ 'src_image_name': 'data/raw/val_imgs/Macron.png'
476
+ }
477
+ if inp is not None:
478
+ inp_tmp.update(inp)
479
+ inp = inp_tmp
480
+
481
+ infer_instance = cls(inp['a2m_ckpt'], inp['head_ckpt'], inp['torso_ckpt'], inp=inp)
482
+ infer_instance.infer_once(inp)
483
+
484
+ ##############
485
+ # IO-related
486
+ ##############
487
+ def save_wav16k(self, audio_name):
488
+ supported_types = ('.wav', '.mp3', '.mp4', '.avi')
489
+ assert audio_name.endswith(supported_types), f"Now we only support {','.join(supported_types)} as audio source!"
490
+ wav16k_name = audio_name[:-4] + '_16k.wav'
491
+ self.wav16k_name = wav16k_name
492
+ extract_wav_cmd = f"ffmpeg -i {audio_name} -f wav -ar 16000 -v quiet -y {wav16k_name} -y"
493
+ os.system(extract_wav_cmd)
494
+ print(f"Extracted wav file (16khz) from {audio_name} to {wav16k_name}.")
495
+
496
+ def get_f0(self, wav16k_name):
497
+ from data_gen.utils.process_audio.extract_mel_f0 import extract_mel_from_fname, extract_f0_from_wav_and_mel
498
+ wav, mel = extract_mel_from_fname(self.wav16k_name)
499
+ f0, f0_coarse = extract_f0_from_wav_and_mel(wav, mel)
500
+ f0 = f0.reshape([-1,1])
501
+ return f0
502
+
503
+ if __name__ == '__main__':
504
+ import argparse, glob, tqdm
505
+ parser = argparse.ArgumentParser()
506
+ parser.add_argument("--a2m_ckpt", default='checkpoints/240126_real3dportrait_orig/audio2secc_vae', type=str)
507
+ parser.add_argument("--head_ckpt", default='', type=str)
508
+ parser.add_argument("--torso_ckpt", default='checkpoints/240126_real3dportrait_orig/secc2plane_torso_orig', type=str)
509
+ parser.add_argument("--src_img", default='', type=str) # data/raw/examples/Macron.png
510
+ parser.add_argument("--bg_img", default='', type=str) # data/raw/examples/bg.png
511
+ parser.add_argument("--drv_aud", default='', type=str) # data/raw/examples/Obama_5s.wav
512
+ parser.add_argument("--drv_pose", default='static', type=str) # data/raw/examples/May_5s.mp4
513
+ parser.add_argument("--blink_mode", default='none', type=str) # none | period
514
+ parser.add_argument("--temperature", default=0.2, type=float) # sampling temperature in audio2motion, higher -> more diverse, less accurate
515
+ parser.add_argument("--mouth_amp", default=0.45, type=float) # scale of predicted mouth, enabled in audio-driven
516
+ parser.add_argument("--head_torso_threshold", default=0.9, type=float, help="0.1~1.0, turn up this value if the hair is translucent")
517
+ parser.add_argument("--out_name", default='') # output filename
518
+ parser.add_argument("--out_mode", default='final') # final: only output talking head video; concat_debug: talking head with internel features
519
+ parser.add_argument("--map_to_init_pose", default='True') # whether to map the pose of first frame to source image
520
+ parser.add_argument("--seed", default=None, type=int) # random seed, default None to use time.time()
521
+
522
+ args = parser.parse_args()
523
+
524
+ inp = {
525
+ 'a2m_ckpt': args.a2m_ckpt,
526
+ 'head_ckpt': args.head_ckpt,
527
+ 'torso_ckpt': args.torso_ckpt,
528
+ 'src_image_name': args.src_img,
529
+ 'bg_image_name': args.bg_img,
530
+ 'drv_audio_name': args.drv_aud,
531
+ 'drv_pose_name': args.drv_pose,
532
+ 'blink_mode': args.blink_mode,
533
+ 'temperature': args.temperature,
534
+ 'mouth_amp': args.mouth_amp,
535
+ 'out_name': args.out_name,
536
+ 'out_mode': args.out_mode,
537
+ 'map_to_init_pose': args.map_to_init_pose,
538
+ 'head_torso_threshold': args.head_torso_threshold,
539
+ 'seed': args.seed,
540
+ }
541
+
542
+ GeneFace2Infer.example_run(inp)
insta.sh ADDED
@@ -0,0 +1,18 @@
1
+
2
+ #conda create -n real3dportrait python=3.9
3
+ #conda activate real3dportrait
4
+ conda install conda-forge::ffmpeg # ffmpeg with libx264 codec to turn images to video
5
+
6
+ ### We recommend torch2.0.1+cuda11.7.
7
+ conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.7 -c pytorch -c nvidia
8
+
9
+ # Build from source, it may take a long time (Proxy is recommended if encountering the time-out problem)
10
+ pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable"
11
+
12
+ # MMCV for some network structure
13
+ pip install cython
14
+ pip install openmim==0.3.9
15
+ mim install mmcv==2.1.0 # use mim to speed up installation for mmcv
16
+
17
+ # other dependencies
18
+ pip install -r docs/prepare_env/requirements.txt -v
modules/audio2motion/cnn_models.py ADDED
@@ -0,0 +1,359 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+
6
+ def init_weights_func(m):
7
+ classname = m.__class__.__name__
8
+ if classname.find("Conv1d") != -1:
9
+ torch.nn.init.xavier_uniform_(m.weight)
10
+
11
+
12
+ class LambdaLayer(nn.Module):
13
+ def __init__(self, lambd):
14
+ super(LambdaLayer, self).__init__()
15
+ self.lambd = lambd
16
+
17
+ def forward(self, x):
18
+ return self.lambd(x)
19
+
20
+
21
+ class LayerNorm(torch.nn.LayerNorm):
22
+ """Layer normalization module.
23
+ :param int nout: output dim size
24
+ :param int dim: dimension to be normalized
25
+ """
26
+
27
+ """Construct a LayerNorm object."""
28
+ """Construct an LayerNorm object."""
29
+ super(LayerNorm, self).__init__(nout, eps=eps)
30
+ self.dim = dim
31
+
32
+ def forward(self, x):
33
+ """Apply layer normalization.
34
+ :param torch.Tensor x: input tensor
35
+ :return: layer normalized tensor
36
+ :rtype torch.Tensor
37
+ """
38
+ if self.dim == -1:
39
+ return super(LayerNorm, self).forward(x)
40
+ return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1)
41
+
42
+
43
+
44
+ class ResidualBlock(nn.Module):
45
+ """Implements conv->PReLU->norm n-times"""
46
+
47
+ def __init__(self, channels, kernel_size, dilation, n=2, norm_type='bn', dropout=0.0,
48
+ c_multiple=2, ln_eps=1e-12, bias=False):
49
+ super(ResidualBlock, self).__init__()
50
+
51
+ if norm_type == 'bn':
52
+ norm_builder = lambda: nn.BatchNorm1d(channels)
53
+ elif norm_type == 'in':
54
+ norm_builder = lambda: nn.InstanceNorm1d(channels, affine=True)
55
+ elif norm_type == 'gn':
56
+ norm_builder = lambda: nn.GroupNorm(8, channels)
57
+ elif norm_type == 'ln':
58
+ norm_builder = lambda: LayerNorm(channels, dim=1, eps=ln_eps)
59
+ else:
60
+ norm_builder = lambda: nn.Identity()
61
+
62
+ self.blocks = [
63
+ nn.Sequential(
64
+ norm_builder(),
65
+ nn.Conv1d(channels, c_multiple * channels, kernel_size, dilation=dilation,
66
+ padding=(dilation * (kernel_size - 1)) // 2, bias=bias),
67
+ LambdaLayer(lambda x: x * kernel_size ** -0.5),
68
+ nn.GELU(),
69
+ nn.Conv1d(c_multiple * channels, channels, 1, dilation=dilation, bias=bias),
70
+ )
71
+ for _ in range(n)
72
+ ]
73
+
74
+ self.blocks = nn.ModuleList(self.blocks)
75
+ self.dropout = dropout
76
+
77
+ def forward(self, x):
78
+ nonpadding = (x.abs().sum(1) > 0).float()[:, None, :]
79
+ for b in self.blocks:
80
+ x_ = b(x)
81
+ if self.dropout > 0 and self.training:
82
+ x_ = F.dropout(x_, self.dropout, training=self.training)
83
+ x = x + x_
84
+ x = x * nonpadding
85
+ return x
86
+
87
+
88
+ class ConvBlocks(nn.Module):
89
+ """Decodes the expanded phoneme encoding into spectrograms"""
90
+
91
+ def __init__(self, channels, out_dims, dilations, kernel_size,
92
+ norm_type='ln', layers_in_block=2, c_multiple=2,
93
+ dropout=0.0, ln_eps=1e-5, init_weights=True, is_BTC=True, bias=False):
94
+ super(ConvBlocks, self).__init__()
95
+ self.is_BTC = is_BTC
96
+ self.res_blocks = nn.Sequential(
97
+ *[ResidualBlock(channels, kernel_size, d,
98
+ n=layers_in_block, norm_type=norm_type, c_multiple=c_multiple,
99
+ dropout=dropout, ln_eps=ln_eps, bias=bias)
100
+ for d in dilations],
101
+ )
102
+ if norm_type == 'bn':
103
+ norm = nn.BatchNorm1d(channels)
104
+ elif norm_type == 'in':
105
+ norm = nn.InstanceNorm1d(channels, affine=True)
106
+ elif norm_type == 'gn':
107
+ norm = nn.GroupNorm(8, channels)
108
+ elif norm_type == 'ln':
109
+ norm = LayerNorm(channels, dim=1, eps=ln_eps)
110
+ self.last_norm = norm
111
+ self.post_net1 = nn.Conv1d(channels, out_dims, kernel_size=3, padding=1, bias=bias)
112
+ if init_weights:
113
+ self.apply(init_weights_func)
114
+
115
+ def forward(self, x):
116
+ """
117
+
118
+ :param x: [B, T, H]
119
+ :return: [B, T, out_dims]
120
+ """
121
+ if self.is_BTC:
122
+ x = x.transpose(1, 2) # [B, C, T]
123
+ nonpadding = (x.abs().sum(1) > 0).float()[:, None, :]
124
+ x = self.res_blocks(x) * nonpadding
125
+ x = self.last_norm(x) * nonpadding
126
+ x = self.post_net1(x) * nonpadding
127
+ if self.is_BTC:
128
+ x = x.transpose(1, 2)
129
+ return x
130
+
131
+
132
+ class SeqLevelConvolutionalModel(nn.Module):
133
+ def __init__(self, out_dim=64, dropout=0.5, audio_feat_type='ppg', backbone_type='unet', norm_type='bn'):
134
+ nn.Module.__init__(self)
135
+ self.audio_feat_type = audio_feat_type
136
+ if audio_feat_type == 'ppg':
137
+ self.audio_encoder = nn.Sequential(*[
138
+ nn.Conv1d(29, 48, 3, 1, 1, bias=False),
139
+ nn.BatchNorm1d(48) if norm_type=='bn' else LayerNorm(48, dim=1),
140
+ nn.GELU(),
141
+ nn.Conv1d(48, 48, 3, 1, 1, bias=False)
142
+ ])
143
+ self.energy_encoder = nn.Sequential(*[
144
+ nn.Conv1d(1, 16, 3, 1, 1, bias=False),
145
+ nn.BatchNorm1d(16) if norm_type=='bn' else LayerNorm(16, dim=1),
146
+ nn.GELU(),
147
+ nn.Conv1d(16, 16, 3, 1, 1, bias=False)
148
+ ])
149
+ elif audio_feat_type == 'mel':
150
+ self.mel_encoder = nn.Sequential(*[
151
+ nn.Conv1d(80, 64, 3, 1, 1, bias=False),
152
+ nn.BatchNorm1d(64) if norm_type=='bn' else LayerNorm(64, dim=1),
153
+ nn.GELU(),
154
+ nn.Conv1d(64, 64, 3, 1, 1, bias=False)
155
+ ])
156
+ else:
157
+ raise NotImplementedError("now only ppg or mel are supported!")
158
+
159
+ self.style_encoder = nn.Sequential(*[
160
+ nn.Linear(135, 256),
161
+ nn.GELU(),
162
+ nn.Linear(256, 256)
163
+ ])
164
+
165
+ if backbone_type == 'resnet':
166
+ self.backbone = ResNetBackbone()
167
+ elif backbone_type == 'unet':
168
+ self.backbone = UNetBackbone()
169
+ elif backbone_type == 'resblocks':
170
+ self.backbone = ResBlocksBackbone()
171
+ else:
172
+ raise NotImplementedError("Now only resnet and unet are supported!")
173
+
174
+ self.out_layer = nn.Sequential(
175
+ nn.BatchNorm1d(512) if norm_type=='bn' else LayerNorm(512, dim=1),
176
+ nn.Conv1d(512, 64, 3, 1, 1, bias=False),
177
+ nn.PReLU(),
178
+ nn.Conv1d(64, out_dim, 3, 1, 1, bias=False)
179
+ )
180
+ self.feat_dropout = nn.Dropout(p=dropout)
181
+
182
+ @property
183
+ def device(self):
184
+ return next(self.backbone.parameters()).device
185
+
186
+ def forward(self, batch, ret, log_dict=None):
187
+ style, x_mask = batch['style'].to(self.device), batch['x_mask'].to(self.device)
188
+ style_feat = self.style_encoder(style) # [B,C=135] => [B,C=256]
189
+
190
+ if self.audio_feat_type == 'ppg':
191
+ audio, energy = batch['audio'].to(self.device), batch['energy'].to(self.device)
192
+ audio_feat = self.audio_encoder(audio.transpose(1,2)).transpose(1,2) * x_mask.unsqueeze(2) # [B,T,C=29] => [B,T,C=48]
193
+ energy_feat = self.energy_encoder(energy.transpose(1,2)).transpose(1,2) * x_mask.unsqueeze(2) # [B,T,C=1] => [B,T,C=16]
194
+ feat = torch.cat([audio_feat, energy_feat], dim=2) # [B,T,C=48+16]
195
+ elif self.audio_feat_type == 'mel':
196
+ mel = batch['mel'].to(self.device)
197
+ feat = self.mel_encoder(mel.transpose(1,2)).transpose(1,2) * x_mask.unsqueeze(2) # [B,T,C=64]
198
+
199
+ feat, x_mask = self.backbone(x=feat, sty=style_feat, x_mask=x_mask)
200
+
201
+ out = self.out_layer(feat.transpose(1,2)).transpose(1,2) * x_mask.unsqueeze(2) # [B,T//2,C=512] => [B,T//2,C=out_dim]
202
+
203
+ ret['pred'] = out
204
+ ret['mask'] = x_mask
205
+ return out
206
+
207
+
208
+ class ResBlocksBackbone(nn.Module):
209
+ def __init__(self, in_dim=64, out_dim=512, p_dropout=0.5, norm_type='bn'):
210
+ super(ResBlocksBackbone,self).__init__()
211
+ self.resblocks_0 = ConvBlocks(channels=in_dim, out_dims=64, dilations=[1]*3, kernel_size=3, norm_type=norm_type, is_BTC=False)
212
+ self.resblocks_1 = ConvBlocks(channels=64, out_dims=128, dilations=[1]*4, kernel_size=3, norm_type=norm_type, is_BTC=False)
213
+ self.resblocks_2 = ConvBlocks(channels=128, out_dims=256, dilations=[1]*14, kernel_size=3, norm_type=norm_type, is_BTC=False)
214
+ self.resblocks_3 = ConvBlocks(channels=512, out_dims=512, dilations=[1]*3, kernel_size=3, norm_type=norm_type, is_BTC=False)
215
+ self.resblocks_4 = ConvBlocks(channels=512, out_dims=out_dim, dilations=[1]*3, kernel_size=3, norm_type=norm_type, is_BTC=False)
216
+
217
+ self.downsampler = LambdaLayer(lambda x: F.interpolate(x, scale_factor=0.5, mode='linear'))
218
+ self.upsampler = LambdaLayer(lambda x: F.interpolate(x, scale_factor=4, mode='linear'))
219
+
220
+ self.dropout = nn.Dropout(p=p_dropout)
221
+
222
+ def forward(self, x, sty, x_mask=1.):
223
+ """
224
+ x: [B, T, C]
225
+ sty: [B, C=256]
226
+ x_mask: [B, T]
227
+ ret: [B, T/2, C]
228
+ """
229
+ x = x.transpose(1, 2) # [B, C, T]
230
+ x_mask = x_mask[:, None, :] # [B, 1, T]
231
+
232
+ x = self.resblocks_0(x) * x_mask # [B, C, T]
233
+
234
+ x_mask = self.downsampler(x_mask) # [B, 1, T/2]
235
+ x = self.downsampler(x) * x_mask # [B, C, T/2]
236
+ x = self.resblocks_1(x) * x_mask # [B, C, T/2]
237
+ x = self.resblocks_2(x) * x_mask # [B, C, T/2]
238
+
239
+ x = self.dropout(x.transpose(1,2)).transpose(1,2)
240
+ sty = sty[:, :, None].repeat([1,1,x_mask.shape[2]]) # [B,C=256,T/2]
241
+ x = torch.cat([x, sty], dim=1) # [B, C=256+256, T/2]
242
+
243
+ x = self.resblocks_3(x) * x_mask # [B, C, T/2]
244
+ x = self.resblocks_4(x) * x_mask # [B, C, T/2]
245
+
246
+ x = x.transpose(1,2)
247
+ x_mask = x_mask.squeeze(1)
248
+ return x, x_mask
249
+
250
+
251
+
252
+ class ResNetBackbone(nn.Module):
253
+ def __init__(self, in_dim=64, out_dim=512, p_dropout=0.5, norm_type='bn'):
254
+ super(ResNetBackbone,self).__init__()
255
+ self.resblocks_0 = ConvBlocks(channels=in_dim, out_dims=64, dilations=[1]*3, kernel_size=3, norm_type=norm_type, is_BTC=False)
256
+ self.resblocks_1 = ConvBlocks(channels=64, out_dims=128, dilations=[1]*4, kernel_size=3, norm_type=norm_type, is_BTC=False)
257
+ self.resblocks_2 = ConvBlocks(channels=128, out_dims=256, dilations=[1]*14, kernel_size=3, norm_type=norm_type, is_BTC=False)
258
+ self.resblocks_3 = ConvBlocks(channels=512, out_dims=512, dilations=[1]*3, kernel_size=3, norm_type=norm_type, is_BTC=False)
259
+ self.resblocks_4 = ConvBlocks(channels=512, out_dims=out_dim, dilations=[1]*3, kernel_size=3, norm_type=norm_type, is_BTC=False)
260
+
261
+ self.downsampler = LambdaLayer(lambda x: F.interpolate(x, scale_factor=0.5, mode='linear'))
262
+ self.upsampler = LambdaLayer(lambda x: F.interpolate(x, scale_factor=4, mode='linear'))
263
+
264
+ self.dropout = nn.Dropout(p=p_dropout)
265
+
266
+ def forward(self, x, sty, x_mask=1.):
267
+ """
268
+ x: [B, T, C]
269
+ sty: [B, C=256]
270
+ x_mask: [B, T]
271
+ ret: [B, T/2, C]
272
+ """
273
+ x = x.transpose(1, 2) # [B, C, T]
274
+ x_mask = x_mask[:, None, :] # [B, 1, T]
275
+
276
+ x = self.resblocks_0(x) * x_mask # [B, C, T]
277
+
278
+ x_mask = self.downsampler(x_mask) # [B, 1, T/2]
279
+ x = self.downsampler(x) * x_mask # [B, C, T/2]
280
+ x = self.resblocks_1(x) * x_mask # [B, C, T/2]
281
+
282
+ x_mask = self.downsampler(x_mask) # [B, 1, T/4]
283
+ x = self.downsampler(x) * x_mask # [B, C, T/4]
284
+ x = self.resblocks_2(x) * x_mask # [B, C, T/4]
285
+
286
+ x_mask = self.downsampler(x_mask) # [B, 1, T/8]
287
+ x = self.downsampler(x) * x_mask # [B, C, T/8]
288
+ x = self.dropout(x.transpose(1,2)).transpose(1,2)
289
+ sty = sty[:, :, None].repeat([1,1,x_mask.shape[2]]) # [B,C=256,T/8]
290
+ x = torch.cat([x, sty], dim=1) # [B, C=256+256, T/8]
291
+ x = self.resblocks_3(x) * x_mask # [B, C, T/8]
292
+
293
+ x_mask = self.upsampler(x_mask) # [B, 1, T/2]
294
+ x = self.upsampler(x) * x_mask # [B, C, T/2]
295
+ x = self.resblocks_4(x) * x_mask # [B, C, T/2]
296
+
297
+ x = x.transpose(1,2)
298
+ x_mask = x_mask.squeeze(1)
299
+ return x, x_mask
300
+
301
+
302
+ class UNetBackbone(nn.Module):
303
+ def __init__(self, in_dim=64, out_dim=512, p_dropout=0.5, norm_type='bn'):
304
+ super(UNetBackbone, self).__init__()
305
+ self.resblocks_0 = ConvBlocks(channels=in_dim, out_dims=64, dilations=[1]*3, kernel_size=3, norm_type=norm_type, is_BTC=False)
306
+ self.resblocks_1 = ConvBlocks(channels=64, out_dims=128, dilations=[1]*4, kernel_size=3, norm_type=norm_type, is_BTC=False)
307
+ self.resblocks_2 = ConvBlocks(channels=128, out_dims=256, dilations=[1]*8, kernel_size=3, norm_type=norm_type, is_BTC=False)
308
+ self.resblocks_3 = ConvBlocks(channels=512, out_dims=512, dilations=[1]*3, kernel_size=3, norm_type=norm_type, is_BTC=False)
309
+ self.resblocks_4 = ConvBlocks(channels=768, out_dims=512, dilations=[1]*3, kernel_size=3, norm_type=norm_type, is_BTC=False) # [768 = c3(512) + c2(256)]
310
+ self.resblocks_5 = ConvBlocks(channels=640, out_dims=out_dim, dilations=[1]*3, kernel_size=3, norm_type=norm_type, is_BTC=False) # [640 = c4(512) + c1(128)]
311
+
312
+ self.downsampler = nn.Upsample(scale_factor=0.5, mode='linear')
313
+ self.upsampler = nn.Upsample(scale_factor=2, mode='linear')
314
+ self.dropout = nn.Dropout(p=p_dropout)
315
+
316
+ def forward(self, x, sty, x_mask=1.):
317
+ """
318
+ x: [B, T, C]
319
+ sty: [B, C=256]
320
+ x_mask: [B, T]
321
+ ret: [B, T/2, C]
322
+ """
323
+ x = x.transpose(1, 2) # [B, C, T]
324
+ x_mask = x_mask[:, None, :] # [B, 1, T]
325
+
326
+ x0 = self.resblocks_0(x) * x_mask # [B, C, T]
327
+
328
+ x_mask = self.downsampler(x_mask) # [B, 1, T/2]
329
+ x = self.downsampler(x0) * x_mask # [B, C, T/2]
330
+ x1 = self.resblocks_1(x) * x_mask # [B, C, T/2]
331
+
332
+ x_mask = self.downsampler(x_mask) # [B, 1, T/4]
333
+ x = self.downsampler(x1) * x_mask # [B, C, T/4]
334
+ x2 = self.resblocks_2(x) * x_mask # [B, C, T/4]
335
+
336
+ x_mask = self.downsampler(x_mask) # [B, 1, T/8]
337
+ x = self.downsampler(x2) * x_mask # [B, C, T/8]
338
+ x = self.dropout(x.transpose(1,2)).transpose(1,2)
339
+ sty = sty[:, :, None].repeat([1,1,x_mask.shape[2]]) # [B,C=256,T/8]
340
+ x = torch.cat([x, sty], dim=1) # [B, C=256+256, T/8]
341
+ x3 = self.resblocks_3(x) * x_mask # [B, C, T/8]
342
+
343
+ x_mask = self.upsampler(x_mask) # [B, 1, T/4]
344
+ x = self.upsampler(x3) * x_mask # [B, C, T/4]
345
+ x = torch.cat([x, self.dropout(x2.transpose(1,2)).transpose(1,2)], dim=1) # [B, C=512+256, T/4]
346
+ x4 = self.resblocks_4(x) * x_mask # [B, C, T/4]
347
+
348
+ x_mask = self.upsampler(x_mask) # [B, 1, T/2]
349
+ x = self.upsampler(x4) * x_mask # [B, C, T/2]
350
+ x = torch.cat([x, self.dropout(x1.transpose(1,2)).transpose(1,2)], dim=1)
351
+ x5 = self.resblocks_5(x) * x_mask # [B, C, T/2]
352
+
353
+ x = x5.transpose(1,2)
354
+ x_mask = x_mask.squeeze(1)
355
+ return x, x_mask
356
+
357
+
358
+ if __name__ == '__main__':
359
+ pass
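As a quick reference, here is a minimal shape-check sketch for the `ConvBlocks` stack defined above (assumed usage; the batch and sequence sizes are arbitrary):

```python
# Shape-check sketch for ConvBlocks: [B, T, C] in, [B, T, out_dims] out (is_BTC=True by default).
import torch
from modules.audio2motion.cnn_models import ConvBlocks

blocks = ConvBlocks(channels=64, out_dims=64, dilations=[1, 2, 4], kernel_size=3, norm_type='ln')
x = torch.randn(2, 100, 64)   # arbitrary batch of 2 sequences, 100 frames, 64 channels
y = blocks(x)
print(y.shape)                # expected: torch.Size([2, 100, 64])
```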
modules/audio2motion/flow_base.py ADDED
@@ -0,0 +1,838 @@
1
+ import scipy
2
+ from scipy import linalg
3
+ from torch.nn import functional as F
4
+ import torch
5
+ from torch import nn
6
+ import numpy as np
7
+
8
+ import modules.audio2motion.utils as utils
9
+ from modules.audio2motion.transformer_models import FFTBlocks
10
+ from utils.commons.hparams import hparams
11
+
12
+
13
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
14
+ n_channels_int = n_channels[0]
15
+ in_act = input_a + input_b
16
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
17
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
18
+ acts = t_act * s_act
19
+ return acts
20
+
21
+ class WN(torch.nn.Module):
22
+ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0,
23
+ p_dropout=0, share_cond_layers=False):
24
+ super(WN, self).__init__()
25
+ assert (kernel_size % 2 == 1)
26
+ assert (hidden_channels % 2 == 0)
27
+ self.hidden_channels = hidden_channels
28
+ self.kernel_size = kernel_size
29
+ self.dilation_rate = dilation_rate
30
+ self.n_layers = n_layers
31
+ self.gin_channels = gin_channels
32
+ self.p_dropout = p_dropout
33
+ self.share_cond_layers = share_cond_layers
34
+
35
+ self.in_layers = torch.nn.ModuleList()
36
+ self.res_skip_layers = torch.nn.ModuleList()
37
+
38
+ self.drop = nn.Dropout(p_dropout)
39
+
40
+ self.use_adapters = hparams.get("use_adapters", False)
41
+ if self.use_adapters:
42
+ self.adapter_layers = torch.nn.ModuleList()
43
+
44
+ if gin_channels != 0 and not share_cond_layers:
45
+ cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
46
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
47
+
48
+ for i in range(n_layers):
49
+ dilation = dilation_rate ** i
50
+ padding = int((kernel_size * dilation - dilation) / 2)
51
+ in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
52
+ dilation=dilation, padding=padding)
53
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
54
+ self.in_layers.append(in_layer)
55
+
56
+ # last one is not necessary
57
+ if i < n_layers - 1:
58
+ res_skip_channels = 2 * hidden_channels
59
+ else:
60
+ res_skip_channels = hidden_channels
61
+
62
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
63
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
64
+ self.res_skip_layers.append(res_skip_layer)
65
+
66
+ if self.use_adapters:
67
+ adapter_layer = MlpAdapter(in_out_dim=res_skip_channels, hid_dim=res_skip_channels//4)
68
+ self.adapter_layers.append(adapter_layer)
69
+
70
+ def forward(self, x, x_mask=None, g=None, **kwargs):
71
+ output = torch.zeros_like(x)
72
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
73
+
74
+ if g is not None and not self.share_cond_layers:
75
+ g = self.cond_layer(g)
76
+
77
+ for i in range(self.n_layers):
78
+ x_in = self.in_layers[i](x)
79
+ x_in = self.drop(x_in)
80
+ if g is not None:
81
+ cond_offset = i * 2 * self.hidden_channels
82
+ g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
83
+ else:
84
+ g_l = torch.zeros_like(x_in)
85
+
86
+ acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
87
+
88
+ res_skip_acts = self.res_skip_layers[i](acts)
89
+ if self.use_adapters:
90
+ res_skip_acts = self.adapter_layers[i](res_skip_acts.transpose(1,2)).transpose(1,2)
91
+ if i < self.n_layers - 1:
92
+ x = (x + res_skip_acts[:, :self.hidden_channels, :]) * x_mask
93
+ output = output + res_skip_acts[:, self.hidden_channels:, :]
94
+ else:
95
+ output = output + res_skip_acts
96
+ return output * x_mask
97
+
98
+ def remove_weight_norm(self):
99
+ def remove_weight_norm(m):
100
+ try:
101
+ nn.utils.remove_weight_norm(m)
102
+ except ValueError: # this module didn't have weight norm
103
+ return
104
+
105
+ self.apply(remove_weight_norm)
106
+
107
+ def enable_adapters(self):
108
+ if not self.use_adapters:
109
+ return
110
+ for adapter_layer in self.adapter_layers:
111
+ adapter_layer.enable()
112
+
113
+ def disable_adapters(self):
114
+ if not self.use_adapters:
115
+ return
116
+ for adapter_layer in self.adapter_layers:
117
+ adapter_layer.disable()
118
+
119
+ class Permute(nn.Module):
120
+ def __init__(self, *args):
121
+ super(Permute, self).__init__()
122
+ self.args = args
123
+
124
+ def forward(self, x):
125
+ return x.permute(self.args)
126
+
127
+
128
+ class LayerNorm(nn.Module):
129
+ def __init__(self, channels, eps=1e-4):
130
+ super().__init__()
131
+ self.channels = channels
132
+ self.eps = eps
133
+
134
+ self.gamma = nn.Parameter(torch.ones(channels))
135
+ self.beta = nn.Parameter(torch.zeros(channels))
136
+
137
+ def forward(self, x):
138
+ n_dims = len(x.shape)
139
+ mean = torch.mean(x, 1, keepdim=True)
140
+ variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
141
+
142
+ x = (x - mean) * torch.rsqrt(variance + self.eps)
143
+
144
+ shape = [1, -1] + [1] * (n_dims - 2)
145
+ x = x * self.gamma.view(*shape) + self.beta.view(*shape)
146
+ return x
147
+
148
+
149
+ class ConvReluNorm(nn.Module):
150
+ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
151
+ super().__init__()
152
+ self.in_channels = in_channels
153
+ self.hidden_channels = hidden_channels
154
+ self.out_channels = out_channels
155
+ self.kernel_size = kernel_size
156
+ self.n_layers = n_layers
157
+ self.p_dropout = p_dropout
158
+ assert n_layers > 1, "Number of layers should be larger than 0."
159
+
160
+ self.conv_layers = nn.ModuleList()
161
+ self.norm_layers = nn.ModuleList()
162
+ self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
163
+ self.norm_layers.append(LayerNorm(hidden_channels))
164
+ self.relu_drop = nn.Sequential(
165
+ nn.ReLU(),
166
+ nn.Dropout(p_dropout))
167
+ for _ in range(n_layers - 1):
168
+ self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
169
+ self.norm_layers.append(LayerNorm(hidden_channels))
170
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
171
+ self.proj.weight.data.zero_()
172
+ self.proj.bias.data.zero_()
173
+
174
+ def forward(self, x, x_mask):
175
+ x_org = x
176
+ for i in range(self.n_layers):
177
+ x = self.conv_layers[i](x * x_mask)
178
+ x = self.norm_layers[i](x)
179
+ x = self.relu_drop(x)
180
+ x = x_org + self.proj(x)
181
+ return x * x_mask
182
+
183
+
184
+
185
+ class ActNorm(nn.Module):
186
+ def __init__(self, channels, ddi=False, **kwargs):
187
+ super().__init__()
188
+ self.channels = channels
189
+ self.initialized = not ddi
190
+
191
+ self.logs = nn.Parameter(torch.zeros(1, channels, 1))
192
+ self.bias = nn.Parameter(torch.zeros(1, channels, 1))
193
+
194
+ def forward(self, x, x_mask=None, reverse=False, **kwargs):
195
+ if x_mask is None:
196
+ x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, dtype=x.dtype)
197
+ x_len = torch.sum(x_mask, [1, 2])
198
+ if not self.initialized:
199
+ self.initialize(x, x_mask)
200
+ self.initialized = True
201
+
202
+ if reverse:
203
+ z = (x - self.bias) * torch.exp(-self.logs) * x_mask
204
+ logdet = torch.sum(-self.logs) * x_len
205
+ else:
206
+ z = (self.bias + torch.exp(self.logs) * x) * x_mask
207
+ logdet = torch.sum(self.logs) * x_len # [b]
208
+ return z, logdet
209
+
210
+ def store_inverse(self):
211
+ pass
212
+
213
+ def set_ddi(self, ddi):
214
+ self.initialized = not ddi
215
+
216
+ def initialize(self, x, x_mask):
217
+ with torch.no_grad():
218
+ denom = torch.sum(x_mask, [0, 2])
219
+ m = torch.sum(x * x_mask, [0, 2]) / denom
220
+ m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom
221
+ v = m_sq - (m ** 2)
222
+ logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6))
223
+
224
+ bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype)
225
+ logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype)
226
+
227
+ self.bias.data.copy_(bias_init)
228
+ self.logs.data.copy_(logs_init)
229
+
230
+
231
+ class InvConvNear(nn.Module):
232
+ def __init__(self, channels, n_split=4, no_jacobian=False, lu=True, n_sqz=2, **kwargs):
233
+ super().__init__()
234
+ assert (n_split % 2 == 0)
235
+ self.channels = channels
236
+ self.n_split = n_split
237
+ self.n_sqz = n_sqz
238
+ self.no_jacobian = no_jacobian
239
+
240
+ w_init = torch.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0]
241
+ if torch.det(w_init) < 0:
242
+ w_init[:, 0] = -1 * w_init[:, 0]
243
+ self.lu = lu
244
+ if lu:
245
+ # LU decomposition can slightly speed up the inverse
246
+ np_p, np_l, np_u = linalg.lu(w_init)
247
+ np_s = np.diag(np_u)
248
+ np_sign_s = np.sign(np_s)
249
+ np_log_s = np.log(np.abs(np_s))
250
+ np_u = np.triu(np_u, k=1)
251
+ l_mask = np.tril(np.ones(w_init.shape, dtype=float), -1)
252
+ eye = np.eye(*w_init.shape, dtype=float)
253
+
254
+ self.register_buffer('p', torch.Tensor(np_p.astype(float)))
255
+ self.register_buffer('sign_s', torch.Tensor(np_sign_s.astype(float)))
256
+ self.l = nn.Parameter(torch.Tensor(np_l.astype(float)), requires_grad=True)
257
+ self.log_s = nn.Parameter(torch.Tensor(np_log_s.astype(float)), requires_grad=True)
258
+ self.u = nn.Parameter(torch.Tensor(np_u.astype(float)), requires_grad=True)
259
+ self.register_buffer('l_mask', torch.Tensor(l_mask))
260
+ self.register_buffer('eye', torch.Tensor(eye))
261
+ else:
262
+ self.weight = nn.Parameter(w_init)
263
+
264
+ def forward(self, x, x_mask=None, reverse=False, **kwargs):
265
+ b, c, t = x.size()
266
+ assert (c % self.n_split == 0)
267
+ if x_mask is None:
268
+ x_mask = 1
269
+ x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t
270
+ else:
271
+ x_len = torch.sum(x_mask, [1, 2])
272
+
273
+ x = x.view(b, self.n_sqz, c // self.n_split, self.n_split // self.n_sqz, t)
274
+ x = x.permute(0, 1, 3, 2, 4).contiguous().view(b, self.n_split, c // self.n_split, t)
275
+
276
+ if self.lu:
277
+ self.weight, log_s = self._get_weight()
278
+ logdet = log_s.sum()
279
+ logdet = logdet * (c / self.n_split) * x_len
280
+ else:
281
+ logdet = torch.logdet(self.weight) * (c / self.n_split) * x_len # [b]
282
+
283
+ if reverse:
284
+ if hasattr(self, "weight_inv"):
285
+ weight = self.weight_inv
286
+ else:
287
+ weight = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype)
288
+ logdet = -logdet
289
+ else:
290
+ weight = self.weight
291
+ if self.no_jacobian:
292
+ logdet = 0
293
+
294
+ weight = weight.view(self.n_split, self.n_split, 1, 1)
295
+ z = F.conv2d(x, weight)
296
+
297
+ z = z.view(b, self.n_sqz, self.n_split // self.n_sqz, c // self.n_split, t)
298
+ z = z.permute(0, 1, 3, 2, 4).contiguous().view(b, c, t) * x_mask
299
+ return z, logdet
300
+
301
+ def _get_weight(self):
302
+ l, log_s, u = self.l, self.log_s, self.u
303
+ l = l * self.l_mask + self.eye
304
+ u = u * self.l_mask.transpose(0, 1).contiguous() + torch.diag(self.sign_s * torch.exp(log_s))
305
+ weight = torch.matmul(self.p, torch.matmul(l, u))
306
+ return weight, log_s
307
+
308
+ def store_inverse(self):
309
+ weight, _ = self._get_weight()
310
+ self.weight_inv = torch.inverse(weight.float()).to(next(self.parameters()).device)
311
+
312
+
313
+ class InvConv(nn.Module):
314
+ def __init__(self, channels, no_jacobian=False, lu=True, **kwargs):
315
+ super().__init__()
316
+ w_shape = [channels, channels]
317
+ w_init = np.linalg.qr(np.random.randn(*w_shape))[0].astype(float)
318
+ LU_decomposed = lu
319
+ if not LU_decomposed:
320
+ # Sample a random orthogonal matrix:
321
+ self.register_parameter("weight", nn.Parameter(torch.Tensor(w_init)))
322
+ else:
323
+ np_p, np_l, np_u = linalg.lu(w_init)
324
+ np_s = np.diag(np_u)
325
+ np_sign_s = np.sign(np_s)
326
+ np_log_s = np.log(np.abs(np_s))
327
+ np_u = np.triu(np_u, k=1)
328
+ l_mask = np.tril(np.ones(w_shape, dtype=float), -1)
329
+ eye = np.eye(*w_shape, dtype=float)
330
+
331
+ self.register_buffer('p', torch.Tensor(np_p.astype(float)))
332
+ self.register_buffer('sign_s', torch.Tensor(np_sign_s.astype(float)))
333
+ self.l = nn.Parameter(torch.Tensor(np_l.astype(float)))
334
+ self.log_s = nn.Parameter(torch.Tensor(np_log_s.astype(float)))
335
+ self.u = nn.Parameter(torch.Tensor(np_u.astype(float)))
336
+ self.l_mask = torch.Tensor(l_mask)
337
+ self.eye = torch.Tensor(eye)
338
+ self.w_shape = w_shape
339
+ self.LU = LU_decomposed
340
+ self.weight = None
341
+
342
+ def get_weight(self, device, reverse):
343
+ w_shape = self.w_shape
344
+ self.p = self.p.to(device)
345
+ self.sign_s = self.sign_s.to(device)
346
+ self.l_mask = self.l_mask.to(device)
347
+ self.eye = self.eye.to(device)
348
+ l = self.l * self.l_mask + self.eye
349
+ u = self.u * self.l_mask.transpose(0, 1).contiguous() + torch.diag(self.sign_s * torch.exp(self.log_s))
350
+ dlogdet = self.log_s.sum()
351
+ if not reverse:
352
+ w = torch.matmul(self.p, torch.matmul(l, u))
353
+ else:
354
+ l = torch.inverse(l.double()).float()
355
+ u = torch.inverse(u.double()).float()
356
+ w = torch.matmul(u, torch.matmul(l, self.p.inverse()))
357
+ return w.view(w_shape[0], w_shape[1], 1), dlogdet
358
+
359
+ def forward(self, x, x_mask=None, reverse=False, **kwargs):
360
+ """
361
+ log-det = log|det(W)| * number of timesteps
362
+ """
363
+ b, c, t = x.size()
364
+ if x_mask is None:
365
+ x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t
366
+ else:
367
+ x_len = torch.sum(x_mask, [1, 2])
368
+ logdet = 0
369
+ if not reverse:
370
+ weight, dlogdet = self.get_weight(x.device, reverse)
371
+ z = F.conv1d(x, weight)
372
+ if logdet is not None:
373
+ logdet = logdet + dlogdet * x_len
374
+ return z, logdet
375
+ else:
376
+ if self.weight is None:
377
+ weight, dlogdet = self.get_weight(x.device, reverse)
378
+ else:
379
+ weight, dlogdet = self.weight, self.dlogdet
380
+ z = F.conv1d(x, weight)
381
+ if logdet is not None:
382
+ logdet = logdet - dlogdet * x_len
383
+ return z, logdet
384
+
385
+ def store_inverse(self):
386
+ self.weight, self.dlogdet = self.get_weight('cuda', reverse=True)
387
+
388
+
389
+ class Flip(nn.Module):
390
+ def forward(self, x, *args, reverse=False, **kwargs):
391
+ x = torch.flip(x, [1])
392
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
393
+ return x, logdet
394
+
395
+ def store_inverse(self):
396
+ pass
397
+
398
+
399
+ class CouplingBlock(nn.Module):
400
+ def __init__(self, in_channels, hidden_channels, kernel_size, dilation_rate, n_layers,
401
+ gin_channels=0, p_dropout=0, sigmoid_scale=False,
402
+ share_cond_layers=False, wn=None):
403
+ super().__init__()
404
+ self.in_channels = in_channels
405
+ self.hidden_channels = hidden_channels
406
+ self.kernel_size = kernel_size
407
+ self.dilation_rate = dilation_rate
408
+ self.n_layers = n_layers
409
+ self.gin_channels = gin_channels
410
+ self.p_dropout = p_dropout
411
+ self.sigmoid_scale = sigmoid_scale
412
+
413
+ start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1)
414
+ start = torch.nn.utils.weight_norm(start)
415
+ self.start = start
416
+ # Initializing last layer to 0 makes the affine coupling layers
417
+ # do nothing at first. This helps with training stability
418
+ end = torch.nn.Conv1d(hidden_channels, in_channels, 1)
419
+ end.weight.data.zero_()
420
+ end.bias.data.zero_()
421
+ self.end = end
422
+ self.wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels,
423
+ p_dropout, share_cond_layers)
424
+ if wn is not None:
425
+ self.wn.in_layers = wn.in_layers
426
+ self.wn.res_skip_layers = wn.res_skip_layers
427
+
428
+ def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs):
429
+ if x_mask is None:
430
+ x_mask = 1
431
+ x_0, x_1 = x[:, :self.in_channels // 2], x[:, self.in_channels // 2:]
432
+
433
+ x = self.start(x_0) * x_mask
434
+ x = self.wn(x, x_mask, g)
435
+ out = self.end(x)
436
+
437
+ z_0 = x_0
438
+ m = out[:, :self.in_channels // 2, :]
439
+ logs = out[:, self.in_channels // 2:, :]
440
+ if self.sigmoid_scale:
441
+ logs = torch.log(1e-6 + torch.sigmoid(logs + 2))
442
+ if reverse:
443
+ z_1 = (x_1 - m) * torch.exp(-logs) * x_mask
444
+ logdet = torch.sum(-logs * x_mask, [1, 2])
445
+ else:
446
+ z_1 = (m + torch.exp(logs) * x_1) * x_mask
447
+ logdet = torch.sum(logs * x_mask, [1, 2])
448
+ z = torch.cat([z_0, z_1], 1)
449
+ return z, logdet
450
+
451
+ def store_inverse(self):
452
+ self.wn.remove_weight_norm()
453
+
454
+
455
+ class GlowFFTBlocks(FFTBlocks):
456
+ def __init__(self, hidden_size=128, gin_channels=256, num_layers=2, ffn_kernel_size=5,
457
+ dropout=None, num_heads=4, use_pos_embed=True, use_last_norm=True,
458
+ norm='ln', use_pos_embed_alpha=True):
459
+ super().__init__(hidden_size, num_layers, ffn_kernel_size, dropout, num_heads, use_pos_embed,
460
+ use_last_norm, norm, use_pos_embed_alpha)
461
+ self.inp_proj = nn.Conv1d(hidden_size + gin_channels, hidden_size, 1)
462
+
463
+ def forward(self, x, x_mask=None, g=None):
464
+ """
465
+ :param x: [B, C_x, T]
466
+ :param x_mask: [B, 1, T]
467
+ :param g: [B, C_g, T]
468
+ :return: [B, C_x, T]
469
+ """
470
+ if g is not None:
471
+ x = self.inp_proj(torch.cat([x, g], 1))
472
+ x = x.transpose(1, 2)
473
+ x = super(GlowFFTBlocks, self).forward(x, x_mask[:, 0] == 0)
474
+ x = x.transpose(1, 2)
475
+ return x
476
+
477
+
478
+ class TransformerCouplingBlock(nn.Module):
479
+ def __init__(self, in_channels, hidden_channels, n_layers,
480
+ gin_channels=0, p_dropout=0, sigmoid_scale=False):
481
+ super().__init__()
482
+ self.in_channels = in_channels
483
+ self.hidden_channels = hidden_channels
484
+ self.n_layers = n_layers
485
+ self.gin_channels = gin_channels
486
+ self.p_dropout = p_dropout
487
+ self.sigmoid_scale = sigmoid_scale
488
+
489
+ start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1)
490
+ self.start = start
491
+ # Initializing last layer to 0 makes the affine coupling layers
492
+ # do nothing at first. This helps with training stability
493
+ end = torch.nn.Conv1d(hidden_channels, in_channels, 1)
494
+ end.weight.data.zero_()
495
+ end.bias.data.zero_()
496
+ self.end = end
497
+ self.fft_blocks = GlowFFTBlocks(
498
+ hidden_size=hidden_channels,
499
+ ffn_kernel_size=3,
500
+ gin_channels=gin_channels,
501
+ num_layers=n_layers)
502
+
503
+ def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs):
504
+ if x_mask is None:
505
+ x_mask = 1
506
+ x_0, x_1 = x[:, :self.in_channels // 2], x[:, self.in_channels // 2:]
507
+
508
+ x = self.start(x_0) * x_mask
509
+ x = self.fft_blocks(x, x_mask, g)
510
+ out = self.end(x)
511
+
512
+ z_0 = x_0
513
+ m = out[:, :self.in_channels // 2, :]
514
+ logs = out[:, self.in_channels // 2:, :]
515
+ if self.sigmoid_scale:
516
+ logs = torch.log(1e-6 + torch.sigmoid(logs + 2))
517
+ if reverse:
518
+ z_1 = (x_1 - m) * torch.exp(-logs) * x_mask
519
+ logdet = torch.sum(-logs * x_mask, [1, 2])
520
+ else:
521
+ z_1 = (m + torch.exp(logs) * x_1) * x_mask
522
+ logdet = torch.sum(logs * x_mask, [1, 2])
523
+ z = torch.cat([z_0, z_1], 1)
524
+ return z, logdet
525
+
526
+ def store_inverse(self):
527
+ pass
528
+
529
+
530
+ class FreqFFTCouplingBlock(nn.Module):
531
+ def __init__(self, in_channels, hidden_channels, n_layers,
532
+ gin_channels=0, p_dropout=0, sigmoid_scale=False):
533
+ super().__init__()
534
+ self.in_channels = in_channels
535
+ self.hidden_channels = hidden_channels
536
+ self.n_layers = n_layers
537
+ self.gin_channels = gin_channels
538
+ self.p_dropout = p_dropout
539
+ self.sigmoid_scale = sigmoid_scale
540
+
541
+ hs = hidden_channels
542
+ stride = 8
543
+ self.start = torch.nn.Conv2d(3, hs, kernel_size=stride * 2,
544
+ stride=stride, padding=stride // 2)
545
+ end = nn.ConvTranspose2d(hs, 2, kernel_size=stride, stride=stride)
546
+ end.weight.data.zero_()
547
+ end.bias.data.zero_()
548
+ self.end = nn.Sequential(
549
+ nn.Conv2d(hs * 3, hs, 3, 1, 1),
550
+ nn.ReLU(),
551
+ nn.GroupNorm(4, hs),
552
+ nn.Conv2d(hs, hs, 3, 1, 1),
553
+ end
554
+ )
555
+ self.fft_v = FFTBlocks(hidden_size=hs, ffn_kernel_size=1, num_layers=n_layers)
556
+ self.fft_h = nn.Sequential(
557
+ nn.Conv1d(hs, hs, 3, 1, 1),
558
+ nn.ReLU(),
559
+ nn.Conv1d(hs, hs, 3, 1, 1),
560
+ )
561
+ self.fft_g = nn.Sequential(
562
+ nn.Conv1d(
563
+ gin_channels - 160, hs, kernel_size=stride * 2, stride=stride, padding=stride // 2),
564
+ Permute(0, 2, 1),
565
+ FFTBlocks(hidden_size=hs, ffn_kernel_size=1, num_layers=n_layers),
566
+ Permute(0, 2, 1),
567
+ )
568
+
569
+ def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs):
570
+ g_, _ = utils.unsqueeze(g)
571
+ g_mel = g_[:, :80]
572
+ g_txt = g_[:, 80:]
573
+ g_mel, _ = utils.squeeze(g_mel)
574
+ g_txt, _ = utils.squeeze(g_txt) # [B, C, T]
575
+
576
+ if x_mask is None:
577
+ x_mask = 1
578
+ x_0, x_1 = x[:, :self.in_channels // 2], x[:, self.in_channels // 2:]
579
+ x = torch.stack([x_0, g_mel[:, :80], g_mel[:, 80:]], 1)
580
+ x = self.start(x) # [B, C, N_bins, T]
581
+ B, C, N_bins, T = x.shape
582
+
583
+ x_v = self.fft_v(x.permute(0, 3, 2, 1).reshape(B * T, N_bins, C))
584
+ x_v = x_v.reshape(B, T, N_bins, -1).permute(0, 3, 2, 1)
585
+ # x_v = x
586
+
587
+ x_h = self.fft_h(x.permute(0, 2, 1, 3).reshape(B * N_bins, C, T))
588
+ x_h = x_h.reshape(B, N_bins, -1, T).permute(0, 2, 1, 3)
589
+ # x_h = x
590
+
591
+ x_g = self.fft_g(g_txt)[:, :, None, :].repeat(1, 1, 10, 1)
592
+ x = torch.cat([x_v, x_h, x_g], 1)
593
+ out = self.end(x)
594
+
595
+ z_0 = x_0
596
+ m = out[:, 0]
597
+ logs = out[:, 1]
598
+ if self.sigmoid_scale:
599
+ logs = torch.log(1e-6 + torch.sigmoid(logs + 2))
600
+ if reverse:
601
+ z_1 = (x_1 - m) * torch.exp(-logs) * x_mask
602
+ logdet = torch.sum(-logs * x_mask, [1, 2])
603
+ else:
604
+ z_1 = (m + torch.exp(logs) * x_1) * x_mask
605
+ logdet = torch.sum(logs * x_mask, [1, 2])
606
+ z = torch.cat([z_0, z_1], 1)
607
+ return z, logdet
608
+
609
+ def store_inverse(self):
610
+ pass
611
+
612
+
613
+
614
+ class ResidualCouplingLayer(nn.Module):
615
+ def __init__(self,
616
+ channels,
617
+ hidden_channels,
618
+ kernel_size,
619
+ dilation_rate,
620
+ n_layers,
621
+ p_dropout=0,
622
+ gin_channels=0,
623
+ mean_only=False,
624
+ nn_type='wn'):
625
+ assert channels % 2 == 0, "channels should be divisible by 2"
626
+ super().__init__()
627
+ self.channels = channels
628
+ self.hidden_channels = hidden_channels
629
+ self.kernel_size = kernel_size
630
+ self.dilation_rate = dilation_rate
631
+ self.n_layers = n_layers
632
+ self.half_channels = channels // 2
633
+ self.mean_only = mean_only
634
+
635
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
636
+ if nn_type == 'wn':
637
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
638
+ gin_channels=gin_channels)
639
+ # elif nn_type == 'conv':
640
+ # self.enc = ConditionalConvBlocks(
641
+ # hidden_channels, gin_channels, hidden_channels, [1] * n_layers, kernel_size,
642
+ # layers_in_block=1, is_BTC=False)
643
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
644
+ self.post.weight.data.zero_()
645
+ self.post.bias.data.zero_()
646
+
647
+ def forward(self, x, x_mask, g=None, reverse=False):
648
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
649
+ h = self.pre(x0) * x_mask
650
+ h = self.enc(h, x_mask=x_mask, g=g)
651
+ stats = self.post(h) * x_mask
652
+ if not self.mean_only:
653
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
654
+ else:
655
+ m = stats
656
+ logs = torch.zeros_like(m)
657
+
658
+ if not reverse:
659
+ x1 = m + x1 * torch.exp(logs) * x_mask
660
+ x = torch.cat([x0, x1], 1)
661
+ logdet = torch.sum(logs, [1, 2])
662
+ return x, logdet
663
+ else:
664
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
665
+ x = torch.cat([x0, x1], 1)
666
+ logdet = -torch.sum(logs, [1, 2])
667
+ return x, logdet
668
+
669
+
670
+ class ResidualCouplingBlock(nn.Module):
671
+ def __init__(self,
672
+ channels,
673
+ hidden_channels,
674
+ kernel_size,
675
+ dilation_rate,
676
+ n_layers,
677
+ n_flows=4,
678
+ gin_channels=0,
679
+ nn_type='wn'):
680
+ super().__init__()
681
+ self.channels = channels
682
+ self.hidden_channels = hidden_channels
683
+ self.kernel_size = kernel_size
684
+ self.dilation_rate = dilation_rate
685
+ self.n_layers = n_layers
686
+ self.n_flows = n_flows
687
+ self.gin_channels = gin_channels
688
+
689
+ self.flows = nn.ModuleList()
690
+ for i in range(n_flows):
691
+ self.flows.append(
692
+ ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
693
+ gin_channels=gin_channels, mean_only=True, nn_type=nn_type))
694
+ self.flows.append(Flip())
695
+
696
+ def forward(self, x, x_mask, g=None, reverse=False):
697
+ if not reverse:
698
+ for flow in self.flows:
699
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
700
+ else:
701
+ for flow in reversed(self.flows):
702
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
703
+ return x
704
+
705
+
706
+ class Glow(nn.Module):
707
+ def __init__(self,
708
+ in_channels,
709
+ hidden_channels,
710
+ kernel_size,
711
+ dilation_rate,
712
+ n_blocks,
713
+ n_layers,
714
+ p_dropout=0.,
715
+ n_split=4,
716
+ n_sqz=2,
717
+ sigmoid_scale=False,
718
+ gin_channels=0,
719
+ inv_conv_type='near',
720
+ share_cond_layers=False,
721
+ share_wn_layers=0,
722
+ ):
723
+ super().__init__()
724
+ """
725
+ Note that regularization likes weight decay can leads to Nan error!
726
+ """
727
+
728
+ self.in_channels = in_channels
729
+ self.hidden_channels = hidden_channels
730
+ self.kernel_size = kernel_size
731
+ self.dilation_rate = dilation_rate
732
+ self.n_blocks = n_blocks
733
+ self.n_layers = n_layers
734
+ self.p_dropout = p_dropout
735
+ self.n_split = n_split
736
+ self.n_sqz = n_sqz
737
+ self.sigmoid_scale = sigmoid_scale
738
+ self.gin_channels = gin_channels
739
+ self.share_cond_layers = share_cond_layers
740
+ if gin_channels != 0 and share_cond_layers:
741
+ cond_layer = torch.nn.Conv1d(gin_channels * n_sqz, 2 * hidden_channels * n_layers, 1)
742
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
743
+ wn = None
744
+ self.flows = nn.ModuleList()
745
+ for b in range(n_blocks):
746
+ self.flows.append(ActNorm(channels=in_channels * n_sqz))
747
+ if inv_conv_type == 'near':
748
+ self.flows.append(InvConvNear(channels=in_channels * n_sqz, n_split=n_split, n_sqz=n_sqz))
749
+ if inv_conv_type == 'invconv':
750
+ self.flows.append(InvConv(channels=in_channels * n_sqz))
751
+ if share_wn_layers > 0:
752
+ if b % share_wn_layers == 0:
753
+ wn = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels * n_sqz,
754
+ p_dropout, share_cond_layers)
755
+ self.flows.append(
756
+ CouplingBlock(
757
+ in_channels * n_sqz,
758
+ hidden_channels,
759
+ kernel_size=kernel_size,
760
+ dilation_rate=dilation_rate,
761
+ n_layers=n_layers,
762
+ gin_channels=gin_channels * n_sqz,
763
+ p_dropout=p_dropout,
764
+ sigmoid_scale=sigmoid_scale,
765
+ share_cond_layers=share_cond_layers,
766
+ wn=wn
767
+ ))
768
+
769
+ def forward(self, x, x_mask=None, g=None, reverse=False, return_hiddens=False):
770
+ """
771
+ x: [B,T,C]
772
+ x_mask: [B,T]
773
+ g: [B,T,C]
774
+ """
775
+ x = x.transpose(1,2)
776
+ x_mask = x_mask.unsqueeze(1)
777
+ if g is not None:
778
+ g = g.transpose(1,2)
779
+
780
+ logdet_tot = 0
781
+ if not reverse:
782
+ flows = self.flows
783
+ else:
784
+ flows = reversed(self.flows)
785
+ if return_hiddens:
786
+ hs = []
787
+ if self.n_sqz > 1:
788
+ x, x_mask_ = utils.squeeze(x, x_mask, self.n_sqz)
789
+ if g is not None:
790
+ g, _ = utils.squeeze(g, x_mask, self.n_sqz)
791
+ x_mask = x_mask_
792
+ if self.share_cond_layers and g is not None:
793
+ g = self.cond_layer(g)
794
+ for f in flows:
795
+ x, logdet = f(x, x_mask, g=g, reverse=reverse)
796
+ if return_hiddens:
797
+ hs.append(x)
798
+ logdet_tot += logdet
799
+ if self.n_sqz > 1:
800
+ x, x_mask = utils.unsqueeze(x, x_mask, self.n_sqz)
801
+
802
+ x = x.transpose(1,2)
803
+ if return_hiddens:
804
+ return x, logdet_tot, hs
805
+ return x, logdet_tot
806
+
807
+ def store_inverse(self):
808
+ def remove_weight_norm(m):
809
+ try:
810
+ nn.utils.remove_weight_norm(m)
811
+ except ValueError: # this module didn't have weight norm
812
+ return
813
+
814
+ self.apply(remove_weight_norm)
815
+ for f in self.flows:
816
+ f.store_inverse()
817
+
818
+
819
+ if __name__ == '__main__':
820
+ model = Glow(in_channels=64,
821
+ hidden_channels=128,
822
+ kernel_size=5,
823
+ dilation_rate=1,
824
+ n_blocks=12,
825
+ n_layers=4,
826
+ p_dropout=0.0,
827
+ n_split=4,
828
+ n_sqz=2,
829
+ sigmoid_scale=False,
830
+ gin_channels=80
831
+ )
832
+ exp = torch.rand([1,1440,64])
833
+ mel = torch.rand([1,1440,80])
834
+ x_mask = torch.ones([1,1440],dtype=torch.float32)
835
+ y, logdet = model(exp, x_mask, g=mel, reverse=False)
836
+ pred_exp, logdet = model(y, x_mask, g=mel, reverse=True) # run the inverse pass to recover the expression from the latent
837
+ # y: [b, t, c=64]
838
+ print(y.shape, pred_exp.shape)
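For reference, the log-determinant accumulated by `CouplingBlock` (and exercised by the round trip in the `__main__` block above) follows from the change-of-variables formula for an affine coupling layer; this is standard normalizing-flow algebra, restated here only to document the code:

```latex
\text{Affine coupling (forward): } z_0 = x_0, \qquad
z_1 = \bigl(m(x_0) + e^{\,\log s(x_0)} \odot x_1\bigr)\cdot \mathrm{mask} \\
\log\left|\det\frac{\partial z}{\partial x}\right|
  = \sum_{c,\,t} \log s_{c,t}\,\mathrm{mask}_{c,t}
```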
modules/audio2motion/multi_length_disc.py ADDED
@@ -0,0 +1,340 @@
1
+ import numpy as np
2
+ import random
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from modules.audio2motion.cnn_models import LambdaLayer
7
+
8
+
9
+ class Discriminator1DFactory(nn.Module):
10
+ def __init__(self, time_length, kernel_size=3, in_dim=1, hidden_size=128, norm_type='bn'):
11
+ super(Discriminator1DFactory, self).__init__()
12
+ padding = kernel_size // 2
13
+
14
+ def discriminator_block(in_filters, out_filters, first=False):
15
+ """
16
+ Input: (B, c, T)
17
+ Output: (B, c, T//2)
18
+ """
19
+ conv = nn.Conv1d(in_filters, out_filters, kernel_size, 2, padding)
20
+ block = [
21
+ conv, # padding = kernel//2
22
+ nn.LeakyReLU(0.2, inplace=True),
23
+ nn.Dropout2d(0.25)
24
+ ]
25
+ if norm_type == 'bn' and not first:
26
+ block.append(nn.BatchNorm1d(out_filters, 0.8))
27
+ if norm_type == 'in' and not first:
28
+ block.append(nn.InstanceNorm1d(out_filters, affine=True))
29
+ block = nn.Sequential(*block)
30
+ return block
31
+
32
+ if time_length >= 8:
33
+ self.model = nn.ModuleList([
34
+ discriminator_block(in_dim, hidden_size, first=True),
35
+ discriminator_block(hidden_size, hidden_size),
36
+ discriminator_block(hidden_size, hidden_size),
37
+ ])
38
+ ds_size = time_length // (2 ** 3)
39
+ elif time_length == 3:
40
+ self.model = nn.ModuleList([
41
+ nn.Sequential(*[
42
+ nn.Conv1d(in_dim, hidden_size, 3, 1, 0),
43
+ nn.LeakyReLU(0.2, inplace=True),
44
+ nn.Dropout2d(0.25),
45
+ nn.Conv1d(hidden_size, hidden_size, 1, 1, 0),
46
+ nn.LeakyReLU(0.2, inplace=True),
47
+ nn.Dropout2d(0.25),
48
+ nn.BatchNorm1d(hidden_size, 0.8),
49
+ nn.Conv1d(hidden_size, hidden_size, 1, 1, 0),
50
+ nn.LeakyReLU(0.2, inplace=True),
51
+ nn.Dropout2d(0.25),
52
+ nn.BatchNorm1d(hidden_size, 0.8)
53
+ ])
54
+ ])
55
+ ds_size = 1
56
+ elif time_length == 1:
57
+ self.model = nn.ModuleList([
58
+ nn.Sequential(*[
59
+ nn.Linear(in_dim, hidden_size),
60
+ nn.LeakyReLU(0.2, inplace=True),
61
+ nn.Dropout2d(0.25),
62
+ nn.Linear(hidden_size, hidden_size),
63
+ nn.LeakyReLU(0.2, inplace=True),
64
+ nn.Dropout2d(0.25),
65
+ ])
66
+ ])
67
+ ds_size = 1
68
+
69
+ self.adv_layer = nn.Linear(hidden_size * ds_size, 1)
70
+
71
+ def forward(self, x):
72
+ """
73
+
74
+ :param x: [B, C, T]
75
+ :return: validity: [B, 1], h: List of hiddens
76
+ """
77
+ h = []
78
+ if x.shape[-1] == 1:
79
+ x = x.squeeze(-1)
80
+ for l in self.model:
81
+ x = l(x)
82
+ h.append(x)
83
+ if x.ndim == 2:
84
+ b, ct = x.shape
85
+ use_sigmoid = True
86
+ else:
87
+ b, c, t = x.shape
88
+ ct = c * t
89
+ use_sigmoid = False
90
+ x = x.view(b, ct)
91
+ validity = self.adv_layer(x) # [B, 1]
92
+ if use_sigmoid:
93
+ validity = torch.sigmoid(validity)
94
+ return validity, h
95
+
96
+
97
+ class CosineDiscriminator1DFactory(nn.Module):
98
+ def __init__(self, time_length, kernel_size=3, in_dim=1, hidden_size=128, norm_type='bn'):
99
+ super().__init__()
100
+ padding = kernel_size // 2
101
+
102
+ def discriminator_block(in_filters, out_filters, first=False):
103
+ """
104
+ Input: (B, c, T)
105
+ Output: (B, c, T//2)
106
+ """
107
+ conv = nn.Conv1d(in_filters, out_filters, kernel_size, 2, padding)
108
+ block = [
109
+ conv, # padding = kernel//2
110
+ nn.LeakyReLU(0.2, inplace=True),
111
+ nn.Dropout2d(0.25)
112
+ ]
113
+ if norm_type == 'bn' and not first:
114
+ block.append(nn.BatchNorm1d(out_filters, 0.8))
115
+ if norm_type == 'in' and not first:
116
+ block.append(nn.InstanceNorm1d(out_filters, affine=True))
117
+ block = nn.Sequential(*block)
118
+ return block
119
+
120
+ self.model1 = nn.ModuleList([
121
+ discriminator_block(in_dim, hidden_size, first=True),
122
+ discriminator_block(hidden_size, hidden_size),
123
+ discriminator_block(hidden_size, hidden_size),
124
+ ])
125
+
126
+ self.model2 = nn.ModuleList([
127
+ discriminator_block(in_dim, hidden_size, first=True),
128
+ discriminator_block(hidden_size, hidden_size),
129
+ discriminator_block(hidden_size, hidden_size),
130
+ ])
131
+
132
+ self.relu = nn.ReLU()
133
+ def forward(self, x1, x2):
134
+ """
135
+
136
+ :param x1: [B, C, T]
137
+ :param x2: [B, C, T]
138
+ :return: validity: [B, 1], h: List of hiddens
139
+ """
140
+ h1, h2 = [], []
141
+ for l in self.model1:
142
+ x1 = l(x1)
143
+ h1.append(x1)
144
+ for l in self.model2:
145
+ x2 = l(x2)
146
+ h2.append(x2)
147
+ b,c,t = x1.shape
148
+ x1 = x1.view(b, c*t)
149
+ x2 = x2.view(b, c*t)
150
+ x1 = self.relu(x1)
151
+ x2 = self.relu(x2)
152
+ # x1 = F.normalize(x1, p=2, dim=1)
153
+ # x2 = F.normalize(x2, p=2, dim=1)
154
+ validity = F.cosine_similarity(x1, x2)
155
+ return validity, [h1,h2]
156
+
157
+
158
+ class MultiWindowDiscriminator(nn.Module):
159
+ def __init__(self, time_lengths, cond_dim=80, in_dim=64, kernel_size=3, hidden_size=128, disc_type='standard', norm_type='bn', reduction='sum'):
160
+ super(MultiWindowDiscriminator, self).__init__()
161
+ self.win_lengths = time_lengths
162
+ self.reduction = reduction
163
+ self.disc_type = disc_type
164
+
165
+ if cond_dim > 0:
166
+ self.use_cond = True
167
+ self.cond_proj_layers = nn.ModuleList()
168
+ self.in_proj_layers = nn.ModuleList()
169
+ else:
170
+ self.use_cond = False
171
+
172
+ self.conv_layers = nn.ModuleList()
173
+ for time_length in time_lengths:
174
+ conv_layer = [
175
+ Discriminator1DFactory(
176
+ time_length, kernel_size, in_dim=64, hidden_size=hidden_size,
177
+ norm_type=norm_type) if self.disc_type == 'standard'
178
+ else CosineDiscriminator1DFactory(time_length, kernel_size, in_dim=64,
179
+ hidden_size=hidden_size,norm_type=norm_type)
180
+ ]
181
+ self.conv_layers += conv_layer
182
+ if self.use_cond:
183
+ self.cond_proj_layers.append(nn.Linear(cond_dim, 64))
184
+ self.in_proj_layers.append(nn.Linear(in_dim, 64))
185
+
186
+ def clip(self, x, cond, x_len, win_length, start_frames=None):
187
+ '''Randomly clip x to win_length.
188
+ Args:
189
+ x (tensor) : (B, T, C).
190
+ cond (tensor) : (B, T, H).
191
+ x_len (tensor) : (B,).
192
+ win_length (int): target clip length
193
+
194
+ Returns:
195
+ (x_batch, c_batch, start_frames) : clipped x (B, win_length, C), clipped cond (B, win_length, H), and the start frames used.
196
+
197
+ '''
198
+ clip_from_same_frame = start_frames is None
199
+ T_start = 0
200
+ # T_end = x_len.max() - win_length
201
+ T_end = x_len.min() - win_length
202
+ if T_end < 0:
203
+ return None, None, start_frames
204
+ T_end = T_end.item()
205
+ if start_frames is None:
206
+ start_frame = np.random.randint(low=T_start, high=T_end + 1)
207
+ start_frames = [start_frame] * x.size(0)
208
+ else:
209
+ start_frame = start_frames[0]
210
+
211
+
212
+ if clip_from_same_frame:
213
+ x_batch = x[:, start_frame: start_frame + win_length, :]
214
+ c_batch = cond[:, start_frame: start_frame + win_length, :] if cond is not None else None
215
+ else:
216
+ x_lst = []
217
+ c_lst = []
218
+ for i, start_frame in enumerate(start_frames):
219
+ x_lst.append(x[i, start_frame: start_frame + win_length, :])
220
+ if cond is not None:
221
+ c_lst.append(cond[i, start_frame: start_frame + win_length, :])
222
+ x_batch = torch.stack(x_lst, dim=0)
223
+ if cond is None:
224
+ c_batch = None
225
+ else:
226
+ c_batch = torch.stack(c_lst, dim=0)
227
+ return x_batch, c_batch, start_frames
228
+
229
+ def forward(self, x, x_len, cond=None, start_frames_wins=None):
230
+ '''
231
+ Args:
232
+ x (tensor): input motion sequence, (B, T, C).
233
+ x_len (tensor): valid length of each sequence. (B,).
234
+
235
+ Returns:
236
+ tensor : (B, 1) if reduction='sum', else (B, n_windows).
237
+ '''
238
+ validity = []
239
+ if start_frames_wins is None:
240
+ start_frames_wins = [None] * len(self.conv_layers)
241
+ h = []
242
+ for i, start_frames in zip(range(len(self.conv_layers)), start_frames_wins):
243
+ x_clip, c_clip, start_frames = self.clip(
244
+ x, cond, x_len, self.win_lengths[i], start_frames) # (B, win_length, C)
245
+ start_frames_wins[i] = start_frames
246
+ if x_clip is None:
247
+ continue
248
+ if self.disc_type == 'standard':
249
+ if self.use_cond:
250
+ x_clip = self.in_proj_layers[i](x_clip) # (B, T, C)
251
+ c_clip = self.cond_proj_layers[i](c_clip)
252
+ x_clip = x_clip + c_clip
253
+ validity_pred, h_ = self.conv_layers[i](x_clip.transpose(1,2))
254
+ elif self.disc_type == 'cosine':
255
+ assert self.use_cond is True
256
+ x_clip = self.in_proj_layers[i](x_clip) # (B, T, C)
257
+ c_clip = self.cond_proj_layers[i](c_clip)
258
+ validity_pred, h_ = self.conv_layers[i](x_clip.transpose(1,2), c_clip.transpose(1,2))
259
+ else:
260
+ raise NotImplementedError
261
+
262
+ h += h_
263
+ validity.append(validity_pred)
264
+ if len(validity) != len(self.conv_layers):
265
+ return None, start_frames_wins, h
266
+ if self.reduction == 'sum':
267
+ validity = sum(validity) # [B]
268
+ elif self.reduction == 'stack':
269
+ validity = torch.stack(validity, -1) # [B, W_L]
270
+ return validity, start_frames_wins, h
271
+
272
+
273
+ class Discriminator(nn.Module):
274
+ def __init__(self, x_dim=80, y_dim=64, disc_type='standard',
275
+ uncond_disc=False, kernel_size=3, hidden_size=128, norm_type='bn', reduction='sum', time_lengths=(8,16,32)):
276
+ """_summary_
277
+
278
+ Args:
279
+ time_lengths (list, optional): the list of window size. Defaults to [32, 64, 128].
280
+ x_dim (int, optional): the dim of audio features. Defaults to 80, corresponding to mel-spec.
281
+ y_dim (int, optional): the dim of facial coeff. Defaults to 64, correspond to exp; other options can be 7(pose) or 71(exp+pose).
282
+ kernel (tuple, optional): _description_. Defaults to (3, 3).
283
+ c_in (int, optional): _description_. Defaults to 1.
284
+ hidden_size (int, optional): _description_. Defaults to 128.
285
+ norm_type (str, optional): _description_. Defaults to 'bn'.
286
+ reduction (str, optional): _description_. Defaults to 'sum'.
287
+ uncond_disc (bool, optional): _description_. Defaults to False.
288
+ """
289
+ super(Discriminator, self).__init__()
290
+ self.time_lengths = time_lengths
291
+ self.x_dim, self.y_dim = x_dim, y_dim
292
+ self.disc_type = disc_type
293
+ self.reduction = reduction
294
+ self.uncond_disc = uncond_disc
295
+
296
+ if uncond_disc:
297
+ self.x_dim = 0
298
+ cond_dim = 0
299
+
300
+ else:
301
+ cond_dim = 64
302
+ self.mel_encoder = nn.Sequential(*[
303
+ nn.Conv1d(self.x_dim, 64, 3, 1, 1, bias=False),
304
+ nn.BatchNorm1d(64),
305
+ nn.GELU(),
306
+ nn.Conv1d(64, cond_dim, 3, 1, 1, bias=False)
307
+ ])
308
+
309
+ self.disc = MultiWindowDiscriminator(
310
+ time_lengths=self.time_lengths,
311
+ in_dim=self.y_dim,
312
+ cond_dim=cond_dim,
313
+ kernel_size=kernel_size,
314
+ hidden_size=hidden_size, norm_type=norm_type,
315
+ reduction=reduction,
316
+ disc_type=disc_type
317
+ )
318
+ self.downsampler = LambdaLayer(lambda x: F.interpolate(x.transpose(1,2), scale_factor=0.5, mode='nearest').transpose(1,2))
319
+
320
+ @property
321
+ def device(self):
322
+ return next(self.disc.parameters()).device
323
+
324
+ def forward(self,x, batch, start_frames_wins=None):
325
+ """
326
+
327
+ :param x: [B, T, C]
328
+ :param cond: [B, T, cond_size]
329
+ :return:
330
+ """
331
+ x = x.to(self.device)
332
+ if not self.uncond_disc:
333
+ mel = self.downsampler(batch['mel'].to(self.device))
334
+ mel_feat = self.mel_encoder(mel.transpose(1,2)).transpose(1,2)
335
+ else:
336
+ mel_feat = None
337
+ x_len = x.sum(-1).ne(0).int().sum([1])
338
+ disc_confidence, start_frames_wins, h = self.disc(x, x_len, mel_feat, start_frames_wins=start_frames_wins)
339
+ return disc_confidence
340
+
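Finally, a minimal usage sketch for the conditional `Discriminator` above (the shapes are assumptions for illustration; the mel sequence is taken at twice the motion frame rate so that the internal 0.5x downsampling aligns the two):

```python
# Usage sketch for the multi-window discriminator (shapes are illustrative assumptions).
import torch
from modules.audio2motion.multi_length_disc import Discriminator

disc = Discriminator(x_dim=80, y_dim=64, disc_type='standard', time_lengths=(8, 16, 32))
pred_exp = torch.randn(2, 100, 64)          # [B, T, y_dim] predicted expression coefficients
batch = {'mel': torch.randn(2, 200, 80)}    # [B, 2T, x_dim]; downsampled to T inside forward()
confidence = disc(pred_exp, batch)          # [B, 1], summed over the three window sizes
print(confidence.shape)
```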
modules/audio2motion/transformer_base.py ADDED
@@ -0,0 +1,988 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import Parameter
5
+ import torch.onnx.operators
6
+ import torch.nn.functional as F
7
+ from collections import defaultdict
8
+
9
+
10
+ def make_positions(tensor, padding_idx):
11
+ """Replace non-padding symbols with their position numbers.
12
+
13
+ Position numbers begin at padding_idx+1. Padding symbols are ignored.
14
+ """
15
+ # The series of casts and type-conversions here are carefully
16
+ # balanced to both work with ONNX export and XLA. In particular XLA
17
+ # prefers ints, cumsum defaults to output longs, and ONNX doesn't know
18
+ # how to handle the dtype kwarg in cumsum.
19
+ mask = tensor.ne(padding_idx).int()
20
+ return (
21
+ torch.cumsum(mask, dim=1).type_as(mask) * mask
22
+ ).long() + padding_idx
23
+
24
+
25
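For example, with padding_idx=0 the helper turns padded ids into 1-based positions while pads stay at 0:

# make_positions(torch.tensor([[7, 7, 7, 0, 0]]), padding_idx=0)
# -> tensor([[1, 2, 3, 0, 0]])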
+ def softmax(x, dim):
26
+ return F.softmax(x, dim=dim, dtype=torch.float32)
27
+
28
+
29
+ INCREMENTAL_STATE_INSTANCE_ID = defaultdict(lambda: 0)
30
+
31
+ def _get_full_incremental_state_key(module_instance, key):
32
+ module_name = module_instance.__class__.__name__
33
+
34
+ # assign a unique ID to each module instance, so that incremental state is
35
+ # not shared across module instances
36
+ if not hasattr(module_instance, '_instance_id'):
37
+ INCREMENTAL_STATE_INSTANCE_ID[module_name] += 1
38
+ module_instance._instance_id = INCREMENTAL_STATE_INSTANCE_ID[module_name]
39
+
40
+ return '{}.{}.{}'.format(module_name, module_instance._instance_id, key)
41
+
42
+
43
+
44
+ def get_incremental_state(module, incremental_state, key):
45
+ """Helper for getting incremental state for an nn.Module."""
46
+ full_key = _get_full_incremental_state_key(module, key)
47
+ if incremental_state is None or full_key not in incremental_state:
48
+ return None
49
+ return incremental_state[full_key]
50
+
51
+
52
+ def set_incremental_state(module, incremental_state, key, value):
53
+ """Helper for setting incremental state for an nn.Module."""
54
+ if incremental_state is not None:
55
+ full_key = _get_full_incremental_state_key(module, key)
56
+ incremental_state[full_key] = value
57
+
58
+
59
+
60
+ class Reshape(nn.Module):
61
+ def __init__(self, *args):
62
+ super(Reshape, self).__init__()
63
+ self.shape = args
64
+
65
+ def forward(self, x):
66
+ return x.view(self.shape)
67
+
68
+
69
+ class Permute(nn.Module):
70
+ def __init__(self, *args):
71
+ super(Permute, self).__init__()
72
+ self.args = args
73
+
74
+ def forward(self, x):
75
+ return x.permute(self.args)
76
+
77
+
78
+ class LinearNorm(torch.nn.Module):
79
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
80
+ super(LinearNorm, self).__init__()
81
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
82
+
83
+ torch.nn.init.xavier_uniform_(
84
+ self.linear_layer.weight,
85
+ gain=torch.nn.init.calculate_gain(w_init_gain))
86
+
87
+ def forward(self, x):
88
+ return self.linear_layer(x)
89
+
90
+
91
+ class ConvNorm(torch.nn.Module):
92
+ def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
93
+ padding=None, dilation=1, bias=True, w_init_gain='linear'):
94
+ super(ConvNorm, self).__init__()
95
+ if padding is None:
96
+ assert (kernel_size % 2 == 1)
97
+ padding = int(dilation * (kernel_size - 1) / 2)
98
+
99
+ self.conv = torch.nn.Conv1d(in_channels, out_channels,
100
+ kernel_size=kernel_size, stride=stride,
101
+ padding=padding, dilation=dilation,
102
+ bias=bias)
103
+
104
+ torch.nn.init.xavier_uniform_(
105
+ self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
106
+
107
+ def forward(self, signal):
108
+ conv_signal = self.conv(signal)
109
+ return conv_signal
110
+
111
+
112
+ def Embedding(num_embeddings, embedding_dim, padding_idx=None):
113
+ m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
114
+ nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
115
+ if padding_idx is not None:
116
+ nn.init.constant_(m.weight[padding_idx], 0)
117
+ return m
118
+
119
+
120
+ class GroupNorm1DTBC(nn.GroupNorm):
121
+ def forward(self, input):
122
+ return super(GroupNorm1DTBC, self).forward(input.permute(1, 2, 0)).permute(2, 0, 1)
123
+
124
+
125
+ def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
126
+ if not export and torch.cuda.is_available():
127
+ try:
128
+ from apex.normalization import FusedLayerNorm
129
+ return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
130
+ except ImportError:
131
+ pass
132
+ return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
133
+
134
+
135
+ def Linear(in_features, out_features, bias=True):
136
+ m = nn.Linear(in_features, out_features, bias)
137
+ nn.init.xavier_uniform_(m.weight)
138
+ if bias:
139
+ nn.init.constant_(m.bias, 0.)
140
+ return m
141
+
142
+
143
+ class SinusoidalPositionalEmbedding(nn.Module):
144
+ """This module produces sinusoidal positional embeddings of any length.
145
+
146
+ Padding symbols are ignored.
147
+ """
148
+
149
+ def __init__(self, embedding_dim, padding_idx, init_size=1024):
150
+ super().__init__()
151
+ self.embedding_dim = embedding_dim
152
+ self.padding_idx = padding_idx
153
+ self.weights = SinusoidalPositionalEmbedding.get_embedding(
154
+ init_size,
155
+ embedding_dim,
156
+ padding_idx,
157
+ )
158
+ self.register_buffer('_float_tensor', torch.FloatTensor(1))
159
+
160
+ @staticmethod
161
+ def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
162
+ """Build sinusoidal embeddings.
163
+
164
+ This matches the implementation in tensor2tensor, but differs slightly
165
+ from the description in Section 3.5 of "Attention Is All You Need".
166
+ """
167
+ half_dim = embedding_dim // 2
168
+ emb = math.log(10000) / (half_dim - 1)
169
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
170
+ emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
171
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
172
+ if embedding_dim % 2 == 1:
173
+ # zero pad
174
+ emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
175
+ if padding_idx is not None:
176
+ emb[padding_idx, :] = 0
177
+ return emb
178
+
179
+ def forward(self, input, incremental_state=None, timestep=None, positions=None, **kwargs):
180
+ """Input is expected to be of size [bsz x seqlen]."""
181
+ bsz, seq_len = input.shape[:2]
182
+ max_pos = self.padding_idx + 1 + seq_len
183
+ if self.weights is None or max_pos > self.weights.size(0):
184
+ # recompute/expand embeddings if needed
185
+ self.weights = SinusoidalPositionalEmbedding.get_embedding(
186
+ max_pos,
187
+ self.embedding_dim,
188
+ self.padding_idx,
189
+ )
190
+ self.weights = self.weights.to(self._float_tensor)
191
+
192
+ if incremental_state is not None:
193
+ # positions is the same for every token when decoding a single step
194
+ pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
195
+ return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1)
196
+
197
+ positions = make_positions(input, self.padding_idx) if positions is None else positions
198
+ return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach()
199
+
200
+ def max_positions(self):
201
+ """Maximum number of supported positions."""
202
+ return int(1e5) # an arbitrary large number
203
+
204
+
205
+ class ConvTBC(nn.Module):
206
+ def __init__(self, in_channels, out_channels, kernel_size, padding=0):
207
+ super(ConvTBC, self).__init__()
208
+ self.in_channels = in_channels
209
+ self.out_channels = out_channels
210
+ self.kernel_size = kernel_size
211
+ self.padding = padding
212
+
213
+ self.weight = torch.nn.Parameter(torch.Tensor(
214
+ self.kernel_size, in_channels, out_channels))
215
+ self.bias = torch.nn.Parameter(torch.Tensor(out_channels))
216
+
217
+ def forward(self, input):
218
+ return torch.conv_tbc(input.contiguous(), self.weight, self.bias, self.padding)
219
+
220
+
221
+ class MultiheadAttention(nn.Module):
222
+ def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True,
223
+ add_bias_kv=False, add_zero_attn=False, self_attention=False,
224
+ encoder_decoder_attention=False):
225
+ super().__init__()
226
+ self.embed_dim = embed_dim
227
+ self.kdim = kdim if kdim is not None else embed_dim
228
+ self.vdim = vdim if vdim is not None else embed_dim
229
+ self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
230
+
231
+ self.num_heads = num_heads
232
+ self.dropout = dropout
233
+ self.head_dim = embed_dim // num_heads
234
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
235
+ self.scaling = self.head_dim ** -0.5
236
+
237
+ self.self_attention = self_attention
238
+ self.encoder_decoder_attention = encoder_decoder_attention
239
+
240
+ assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \
241
+ 'value to be of the same size'
242
+
243
+ if self.qkv_same_dim:
244
+ self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim))
245
+ else:
246
+ self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
247
+ self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
248
+ self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
249
+
250
+ if bias:
251
+ self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim))
252
+ else:
253
+ self.register_parameter('in_proj_bias', None)
254
+
255
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
256
+
257
+ if add_bias_kv:
258
+ self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
259
+ self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
260
+ else:
261
+ self.bias_k = self.bias_v = None
262
+
263
+ self.add_zero_attn = add_zero_attn
264
+
265
+ self.reset_parameters()
266
+
267
+ self.enable_torch_version = False
268
+ if hasattr(F, "multi_head_attention_forward"):
269
+ self.enable_torch_version = True
270
+ else:
271
+ self.enable_torch_version = False
272
+ self.last_attn_probs = None
273
+
274
+ def reset_parameters(self):
275
+ if self.qkv_same_dim:
276
+ nn.init.xavier_uniform_(self.in_proj_weight)
277
+ else:
278
+ nn.init.xavier_uniform_(self.k_proj_weight)
279
+ nn.init.xavier_uniform_(self.v_proj_weight)
280
+ nn.init.xavier_uniform_(self.q_proj_weight)
281
+
282
+ nn.init.xavier_uniform_(self.out_proj.weight)
283
+ if self.in_proj_bias is not None:
284
+ nn.init.constant_(self.in_proj_bias, 0.)
285
+ nn.init.constant_(self.out_proj.bias, 0.)
286
+ if self.bias_k is not None:
287
+ nn.init.xavier_normal_(self.bias_k)
288
+ if self.bias_v is not None:
289
+ nn.init.xavier_normal_(self.bias_v)
290
+
291
+ def forward(
292
+ self,
293
+ query, key, value,
294
+ key_padding_mask=None,
295
+ incremental_state=None,
296
+ need_weights=True,
297
+ static_kv=False,
298
+ attn_mask=None,
299
+ before_softmax=False,
300
+ need_head_weights=False,
301
+ enc_dec_attn_constraint_mask=None,
302
+ reset_attn_weight=None
303
+ ):
304
+ """Input shape: Time x Batch x Channel
305
+
306
+ Args:
307
+ key_padding_mask (ByteTensor, optional): mask to exclude
308
+ keys that are pads, of shape `(batch, src_len)`, where
309
+ padding elements are indicated by 1s.
310
+ need_weights (bool, optional): return the attention weights,
311
+ averaged over heads (default: False).
312
+ attn_mask (ByteTensor, optional): typically used to
313
+ implement causal attention, where the mask prevents the
314
+ attention from looking forward in time (default: None).
315
+ before_softmax (bool, optional): return the raw attention
316
+ weights and values before the attention softmax.
317
+ need_head_weights (bool, optional): return the attention
318
+ weights for each head. Implies *need_weights*. Default:
319
+ return the average attention weights over all heads.
320
+ """
321
+ if need_head_weights:
322
+ need_weights = True
323
+
324
+ tgt_len, bsz, embed_dim = query.size()
325
+ assert embed_dim == self.embed_dim
326
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
327
+ if self.enable_torch_version and incremental_state is None and not static_kv and reset_attn_weight is None:
328
+ if self.qkv_same_dim:
329
+ return F.multi_head_attention_forward(query, key, value,
330
+ self.embed_dim, self.num_heads,
331
+ self.in_proj_weight,
332
+ self.in_proj_bias, self.bias_k, self.bias_v,
333
+ self.add_zero_attn, self.dropout,
334
+ self.out_proj.weight, self.out_proj.bias,
335
+ self.training, key_padding_mask, need_weights,
336
+ attn_mask)
337
+ else:
338
+ return F.multi_head_attention_forward(query, key, value,
339
+ self.embed_dim, self.num_heads,
340
+ torch.empty([0]),
341
+ self.in_proj_bias, self.bias_k, self.bias_v,
342
+ self.add_zero_attn, self.dropout,
343
+ self.out_proj.weight, self.out_proj.bias,
344
+ self.training, key_padding_mask, need_weights,
345
+ attn_mask, use_separate_proj_weight=True,
346
+ q_proj_weight=self.q_proj_weight,
347
+ k_proj_weight=self.k_proj_weight,
348
+ v_proj_weight=self.v_proj_weight)
349
+
350
+ if incremental_state is not None:
351
+ saved_state = self._get_input_buffer(incremental_state)
352
+ if 'prev_key' in saved_state:
353
+ # previous time steps are cached - no need to recompute
354
+ # key and value if they are static
355
+ if static_kv:
356
+ assert self.encoder_decoder_attention and not self.self_attention
357
+ key = value = None
358
+ else:
359
+ saved_state = None
360
+
361
+ if self.self_attention:
362
+ # self-attention
363
+ q, k, v = self.in_proj_qkv(query)
364
+ elif self.encoder_decoder_attention:
365
+ # encoder-decoder attention
366
+ q = self.in_proj_q(query)
367
+ if key is None:
368
+ assert value is None
369
+ k = v = None
370
+ else:
371
+ k = self.in_proj_k(key)
372
+ v = self.in_proj_v(key)
373
+
374
+ else:
375
+ q = self.in_proj_q(query)
376
+ k = self.in_proj_k(key)
377
+ v = self.in_proj_v(value)
378
+ q *= self.scaling
379
+
380
+ if self.bias_k is not None:
381
+ assert self.bias_v is not None
382
+ k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
383
+ v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
384
+ if attn_mask is not None:
385
+ attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
386
+ if key_padding_mask is not None:
387
+ key_padding_mask = torch.cat(
388
+ [key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1)], dim=1)
389
+
390
+ q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
391
+ if k is not None:
392
+ k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
393
+ if v is not None:
394
+ v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
395
+
396
+ if saved_state is not None:
397
+ # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
398
+ if 'prev_key' in saved_state:
399
+ prev_key = saved_state['prev_key'].view(bsz * self.num_heads, -1, self.head_dim)
400
+ if static_kv:
401
+ k = prev_key
402
+ else:
403
+ k = torch.cat((prev_key, k), dim=1)
404
+ if 'prev_value' in saved_state:
405
+ prev_value = saved_state['prev_value'].view(bsz * self.num_heads, -1, self.head_dim)
406
+ if static_kv:
407
+ v = prev_value
408
+ else:
409
+ v = torch.cat((prev_value, v), dim=1)
410
+ if 'prev_key_padding_mask' in saved_state and saved_state['prev_key_padding_mask'] is not None:
411
+ prev_key_padding_mask = saved_state['prev_key_padding_mask']
412
+ if static_kv:
413
+ key_padding_mask = prev_key_padding_mask
414
+ else:
415
+ key_padding_mask = torch.cat((prev_key_padding_mask, key_padding_mask), dim=1)
416
+
417
+ saved_state['prev_key'] = k.view(bsz, self.num_heads, -1, self.head_dim)
418
+ saved_state['prev_value'] = v.view(bsz, self.num_heads, -1, self.head_dim)
419
+ saved_state['prev_key_padding_mask'] = key_padding_mask
420
+
421
+ self._set_input_buffer(incremental_state, saved_state)
422
+
423
+ src_len = k.size(1)
424
+
425
+ # This is part of a workaround to get around fork/join parallelism
426
+ # not supporting Optional types.
427
+ if key_padding_mask is not None and key_padding_mask.shape == torch.Size([]):
428
+ key_padding_mask = None
429
+
430
+ if key_padding_mask is not None:
431
+ assert key_padding_mask.size(0) == bsz
432
+ assert key_padding_mask.size(1) == src_len
433
+
434
+ if self.add_zero_attn:
435
+ src_len += 1
436
+ k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
437
+ v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
438
+ if attn_mask is not None:
439
+ attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
440
+ if key_padding_mask is not None:
441
+ key_padding_mask = torch.cat(
442
+ [key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1)
443
+
444
+ attn_weights = torch.bmm(q, k.transpose(1, 2))
445
+ attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
446
+
447
+ assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
448
+
449
+ if attn_mask is not None:
450
+ if len(attn_mask.shape) == 2:
451
+ attn_mask = attn_mask.unsqueeze(0)
452
+ elif len(attn_mask.shape) == 3:
453
+ attn_mask = attn_mask[:, None].repeat([1, self.num_heads, 1, 1]).reshape(
454
+ bsz * self.num_heads, tgt_len, src_len)
455
+ attn_weights = attn_weights + attn_mask
456
+
457
+ if enc_dec_attn_constraint_mask is not None: # bs x head x L_kv
458
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
459
+ attn_weights = attn_weights.masked_fill(
460
+ enc_dec_attn_constraint_mask.unsqueeze(2).bool(),
461
+ -1e8,
462
+ )
463
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
464
+
465
+ if key_padding_mask is not None:
466
+ # don't attend to padding symbols
467
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
468
+ attn_weights = attn_weights.masked_fill(
469
+ key_padding_mask.unsqueeze(1).unsqueeze(2),
470
+ -1e8,
471
+ )
472
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
473
+
474
+ attn_logits = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
475
+
476
+ if before_softmax:
477
+ return attn_weights, v
478
+
479
+ attn_weights_float = softmax(attn_weights, dim=-1)
480
+ attn_weights = attn_weights_float.type_as(attn_weights)
481
+ attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training)
482
+
483
+ if reset_attn_weight is not None:
484
+ if reset_attn_weight:
485
+ self.last_attn_probs = attn_probs.detach()
486
+ else:
487
+ assert self.last_attn_probs is not None
488
+ attn_probs = self.last_attn_probs
489
+ attn = torch.bmm(attn_probs, v)
490
+ assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
491
+ attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
492
+ attn = self.out_proj(attn)
493
+
494
+ if need_weights:
495
+ attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
496
+ if not need_head_weights:
497
+ # average attention weights over heads
498
+ attn_weights = attn_weights.mean(dim=0)
499
+ else:
500
+ attn_weights = None
501
+
502
+ return attn, (attn_weights, attn_logits)
503
+
504
+ def in_proj_qkv(self, query):
505
+ return self._in_proj(query).chunk(3, dim=-1)
506
+
507
+ def in_proj_q(self, query):
508
+ if self.qkv_same_dim:
509
+ return self._in_proj(query, end=self.embed_dim)
510
+ else:
511
+ bias = self.in_proj_bias
512
+ if bias is not None:
513
+ bias = bias[:self.embed_dim]
514
+ return F.linear(query, self.q_proj_weight, bias)
515
+
516
+ def in_proj_k(self, key):
517
+ if self.qkv_same_dim:
518
+ return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim)
519
+ else:
520
+ weight = self.k_proj_weight
521
+ bias = self.in_proj_bias
522
+ if bias is not None:
523
+ bias = bias[self.embed_dim:2 * self.embed_dim]
524
+ return F.linear(key, weight, bias)
525
+
526
+ def in_proj_v(self, value):
527
+ if self.qkv_same_dim:
528
+ return self._in_proj(value, start=2 * self.embed_dim)
529
+ else:
530
+ weight = self.v_proj_weight
531
+ bias = self.in_proj_bias
532
+ if bias is not None:
533
+ bias = bias[2 * self.embed_dim:]
534
+ return F.linear(value, weight, bias)
535
+
536
+ def _in_proj(self, input, start=0, end=None):
537
+ weight = self.in_proj_weight
538
+ bias = self.in_proj_bias
539
+ weight = weight[start:end, :]
540
+ if bias is not None:
541
+ bias = bias[start:end]
542
+ return F.linear(input, weight, bias)
543
+
544
+ def _get_input_buffer(self, incremental_state):
545
+ return get_incremental_state(
546
+ self,
547
+ incremental_state,
548
+ 'attn_state',
549
+ ) or {}
550
+
551
+ def _set_input_buffer(self, incremental_state, buffer):
552
+ set_incremental_state(
553
+ self,
554
+ incremental_state,
555
+ 'attn_state',
556
+ buffer,
557
+ )
558
+
559
+ def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
560
+ return attn_weights
561
+
562
+ def clear_buffer(self, incremental_state=None):
563
+ if incremental_state is not None:
564
+ saved_state = self._get_input_buffer(incremental_state)
565
+ if 'prev_key' in saved_state:
566
+ del saved_state['prev_key']
567
+ if 'prev_value' in saved_state:
568
+ del saved_state['prev_value']
569
+ self._set_input_buffer(incremental_state, saved_state)
570
+
571
+
572
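The attention module follows fairseq's time-first convention; a minimal self-attention sketch with assumed sizes:

mha = MultiheadAttention(embed_dim=256, num_heads=4, self_attention=True)
x = torch.randn(50, 2, 256)                      # [T, B, C], time-first
pad_mask = torch.zeros(2, 50, dtype=torch.bool)  # True marks padded key positions
out, attn = mha(query=x, key=x, value=x, key_padding_mask=pad_mask)
# out: [T, B, C]; attn is the head-averaged weight tensor on the fast
# F.multi_head_attention_forward path, and an (attn_weights, attn_logits)
# tuple on the manual path used for incremental decoding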
+ class Swish(torch.autograd.Function):
573
+ @staticmethod
574
+ def forward(ctx, i):
575
+ result = i * torch.sigmoid(i)
576
+ ctx.save_for_backward(i)
577
+ return result
578
+
579
+ @staticmethod
580
+ def backward(ctx, grad_output):
581
+ i = ctx.saved_tensors[0]
582
+ sigmoid_i = torch.sigmoid(i)
583
+ return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
584
+
585
+
586
+ class CustomSwish(nn.Module):
587
+ def forward(self, input_tensor):
588
+ return Swish.apply(input_tensor)
589
+
590
+
591
+ class TransformerFFNLayer(nn.Module):
592
+ def __init__(self, hidden_size, filter_size, padding="SAME", kernel_size=1, dropout=0., act='gelu'):
593
+ super().__init__()
594
+ self.kernel_size = kernel_size
595
+ self.dropout = dropout
596
+ self.act = act
597
+ if padding == 'SAME':
598
+ self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2)
599
+ elif padding == 'LEFT':
600
+ self.ffn_1 = nn.Sequential(
601
+ nn.ConstantPad1d((kernel_size - 1, 0), 0.0),
602
+ nn.Conv1d(hidden_size, filter_size, kernel_size)
603
+ )
604
+ self.ffn_2 = Linear(filter_size, hidden_size)
605
+ if self.act == 'swish':
606
+ self.swish_fn = CustomSwish()
607
+
608
+ def forward(self, x, incremental_state=None):
609
+ # x: T x B x C
610
+ if incremental_state is not None:
611
+ saved_state = self._get_input_buffer(incremental_state)
612
+ if 'prev_input' in saved_state:
613
+ prev_input = saved_state['prev_input']
614
+ x = torch.cat((prev_input, x), dim=0)
615
+ x = x[-self.kernel_size:]
616
+ saved_state['prev_input'] = x
617
+ self._set_input_buffer(incremental_state, saved_state)
618
+
619
+ x = self.ffn_1(x.permute(1, 2, 0)).permute(2, 0, 1)
620
+ x = x * self.kernel_size ** -0.5
621
+
622
+ if incremental_state is not None:
623
+ x = x[-1:]
624
+ if self.act == 'gelu':
625
+ x = F.gelu(x)
626
+ if self.act == 'relu':
627
+ x = F.relu(x)
628
+ if self.act == 'swish':
629
+ x = self.swish_fn(x)
630
+ x = F.dropout(x, self.dropout, training=self.training)
631
+ x = self.ffn_2(x)
632
+ return x
633
+
634
+ def _get_input_buffer(self, incremental_state):
635
+ return get_incremental_state(
636
+ self,
637
+ incremental_state,
638
+ 'f',
639
+ ) or {}
640
+
641
+ def _set_input_buffer(self, incremental_state, buffer):
642
+ set_incremental_state(
643
+ self,
644
+ incremental_state,
645
+ 'f',
646
+ buffer,
647
+ )
648
+
649
+ def clear_buffer(self, incremental_state):
650
+ if incremental_state is not None:
651
+ saved_state = self._get_input_buffer(incremental_state)
652
+ if 'prev_input' in saved_state:
653
+ del saved_state['prev_input']
654
+ self._set_input_buffer(incremental_state, saved_state)
655
+
656
+
657
+ class BatchNorm1dTBC(nn.Module):
658
+ def __init__(self, c):
659
+ super(BatchNorm1dTBC, self).__init__()
660
+ self.bn = nn.BatchNorm1d(c)
661
+
662
+ def forward(self, x):
663
+ """
664
+
665
+ :param x: [T, B, C]
666
+ :return: [T, B, C]
667
+ """
668
+ x = x.permute(1, 2, 0) # [B, C, T]
669
+ x = self.bn(x) # [B, C, T]
670
+ x = x.permute(2, 0, 1) # [T, B, C]
671
+ return x
672
+
673
+
674
+ class EncSALayer(nn.Module):
675
+ def __init__(self, c, num_heads, dropout, attention_dropout=0.1,
676
+ relu_dropout=0.1, kernel_size=9, padding='SAME', norm='ln', act='gelu'):
677
+ super().__init__()
678
+ self.c = c
679
+ self.dropout = dropout
680
+ self.num_heads = num_heads
681
+ if num_heads > 0:
682
+ if norm == 'ln':
683
+ self.layer_norm1 = LayerNorm(c)
684
+ elif norm == 'bn':
685
+ self.layer_norm1 = BatchNorm1dTBC(c)
686
+ elif norm == 'gn':
687
+ self.layer_norm1 = GroupNorm1DTBC(8, c)
688
+ self.self_attn = MultiheadAttention(
689
+ self.c, num_heads, self_attention=True, dropout=attention_dropout, bias=False)
690
+ if norm == 'ln':
691
+ self.layer_norm2 = LayerNorm(c)
692
+ elif norm == 'bn':
693
+ self.layer_norm2 = BatchNorm1dTBC(c)
694
+ elif norm == 'gn':
695
+ self.layer_norm2 = GroupNorm1DTBC(8, c)
696
+ self.ffn = TransformerFFNLayer(
697
+ c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, padding=padding, act=act)
698
+
699
+ def forward(self, x, encoder_padding_mask=None, **kwargs):
700
+ layer_norm_training = kwargs.get('layer_norm_training', None)
701
+ if layer_norm_training is not None:
702
+ self.layer_norm1.training = layer_norm_training
703
+ self.layer_norm2.training = layer_norm_training
704
+ if self.num_heads > 0:
705
+ residual = x
706
+ x = self.layer_norm1(x)
707
+ x, _, = self.self_attn(
708
+ query=x,
709
+ key=x,
710
+ value=x,
711
+ key_padding_mask=encoder_padding_mask
712
+ )
713
+ x = F.dropout(x, self.dropout, training=self.training)
714
+ x = residual + x
715
+ x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]
716
+
717
+ residual = x
718
+ x = self.layer_norm2(x)
719
+ x = self.ffn(x)
720
+ x = F.dropout(x, self.dropout, training=self.training)
721
+ x = residual + x
722
+ x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]
723
+ return x
724
+
725
+
726
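A small sketch of one encoder layer on a padded batch (sizes are assumptions; in the mask, True/1 marks padded frames):

layer = EncSALayer(c=192, num_heads=2, dropout=0.1, kernel_size=9, norm='ln')
x = torch.randn(50, 2, 192)                       # [T, B, C]
pad = torch.zeros(2, 50, dtype=torch.bool)        # [B, T], True = padded frame
y = layer(x, encoder_padding_mask=pad)            # [T, B, C] with padded steps zeroed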
+ class DecSALayer(nn.Module):
727
+ def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_dropout=0.1,
728
+ kernel_size=9, act='gelu', norm='ln'):
729
+ super().__init__()
730
+ self.c = c
731
+ self.dropout = dropout
732
+ if norm == 'ln':
733
+ self.layer_norm1 = LayerNorm(c)
734
+ elif norm == 'gn':
735
+ self.layer_norm1 = GroupNorm1DTBC(8, c)
736
+ self.self_attn = MultiheadAttention(
737
+ c, num_heads, self_attention=True, dropout=attention_dropout, bias=False
738
+ )
739
+ if norm == 'ln':
740
+ self.layer_norm2 = LayerNorm(c)
741
+ elif norm == 'gn':
742
+ self.layer_norm2 = GroupNorm1DTBC(8, c)
743
+ self.encoder_attn = MultiheadAttention(
744
+ c, num_heads, encoder_decoder_attention=True, dropout=attention_dropout, bias=False,
745
+ )
746
+ if norm == 'ln':
747
+ self.layer_norm3 = LayerNorm(c)
748
+ elif norm == 'gn':
749
+ self.layer_norm3 = GroupNorm1DTBC(8, c)
750
+ self.ffn = TransformerFFNLayer(
751
+ c, 4 * c, padding='LEFT', kernel_size=kernel_size, dropout=relu_dropout, act=act)
752
+
753
+ def forward(
754
+ self,
755
+ x,
756
+ encoder_out=None,
757
+ encoder_padding_mask=None,
758
+ incremental_state=None,
759
+ self_attn_mask=None,
760
+ self_attn_padding_mask=None,
761
+ attn_out=None,
762
+ reset_attn_weight=None,
763
+ **kwargs,
764
+ ):
765
+ layer_norm_training = kwargs.get('layer_norm_training', None)
766
+ if layer_norm_training is not None:
767
+ self.layer_norm1.training = layer_norm_training
768
+ self.layer_norm2.training = layer_norm_training
769
+ self.layer_norm3.training = layer_norm_training
770
+ residual = x
771
+ x = self.layer_norm1(x)
772
+ x, _ = self.self_attn(
773
+ query=x,
774
+ key=x,
775
+ value=x,
776
+ key_padding_mask=self_attn_padding_mask,
777
+ incremental_state=incremental_state,
778
+ attn_mask=self_attn_mask
779
+ )
780
+ x = F.dropout(x, self.dropout, training=self.training)
781
+ x = residual + x
782
+
783
+ attn_logits = None
784
+ if encoder_out is not None or attn_out is not None:
785
+ residual = x
786
+ x = self.layer_norm2(x)
787
+ if encoder_out is not None:
788
+ x, attn = self.encoder_attn(
789
+ query=x,
790
+ key=encoder_out,
791
+ value=encoder_out,
792
+ key_padding_mask=encoder_padding_mask,
793
+ incremental_state=incremental_state,
794
+ static_kv=True,
795
+ enc_dec_attn_constraint_mask=get_incremental_state(self, incremental_state,
796
+ 'enc_dec_attn_constraint_mask'),
797
+ reset_attn_weight=reset_attn_weight
798
+ )
799
+ attn_logits = attn[1]
800
+ elif attn_out is not None:
801
+ x = self.encoder_attn.in_proj_v(attn_out)
802
+ if encoder_out is not None or attn_out is not None:
803
+ x = F.dropout(x, self.dropout, training=self.training)
804
+ x = residual + x
805
+
806
+ residual = x
807
+ x = self.layer_norm3(x)
808
+ x = self.ffn(x, incremental_state=incremental_state)
809
+ x = F.dropout(x, self.dropout, training=self.training)
810
+ x = residual + x
811
+ return x, attn_logits
812
+
813
+ def clear_buffer(self, input, encoder_out=None, encoder_padding_mask=None, incremental_state=None):
814
+ self.encoder_attn.clear_buffer(incremental_state)
815
+ self.ffn.clear_buffer(incremental_state)
816
+
817
+ def set_buffer(self, name, tensor, incremental_state):
818
+ return set_incremental_state(self, incremental_state, name, tensor)
819
+
820
+
821
+ class ConvBlock(nn.Module):
822
+ def __init__(self, idim=80, n_chans=256, kernel_size=3, stride=1, norm='gn', dropout=0):
823
+ super().__init__()
824
+ self.conv = ConvNorm(idim, n_chans, kernel_size, stride=stride)
825
+ self.norm = norm
826
+ if self.norm == 'bn':
827
+ self.norm = nn.BatchNorm1d(n_chans)
828
+ elif self.norm == 'in':
829
+ self.norm = nn.InstanceNorm1d(n_chans, affine=True)
830
+ elif self.norm == 'gn':
831
+ self.norm = nn.GroupNorm(n_chans // 16, n_chans)
832
+ elif self.norm == 'ln':
833
+ self.norm = LayerNorm(n_chans // 16, n_chans)
834
+ elif self.norm == 'wn':
835
+ self.conv = torch.nn.utils.weight_norm(self.conv.conv)
836
+ self.dropout = nn.Dropout(dropout)
837
+ self.relu = nn.ReLU()
838
+
839
+ def forward(self, x):
840
+ """
841
+
842
+ :param x: [B, C, T]
843
+ :return: [B, C, T]
844
+ """
845
+ x = self.conv(x)
846
+ if not isinstance(self.norm, str):
847
+ if self.norm == 'none':
848
+ pass
849
+ elif self.norm == 'ln':
850
+ x = self.norm(x.transpose(1, 2)).transpose(1, 2)
851
+ else:
852
+ x = self.norm(x)
853
+ x = self.relu(x)
854
+ x = self.dropout(x)
855
+ return x
856
+
857
+
858
+ class ConvStacks(nn.Module):
859
+ def __init__(self, idim=80, n_layers=5, n_chans=256, odim=32, kernel_size=5, norm='gn',
860
+ dropout=0, strides=None, res=True):
861
+ super().__init__()
862
+ self.conv = torch.nn.ModuleList()
863
+ self.kernel_size = kernel_size
864
+ self.res = res
865
+ self.in_proj = Linear(idim, n_chans)
866
+ if strides is None:
867
+ strides = [1] * n_layers
868
+ else:
869
+ assert len(strides) == n_layers
870
+ for idx in range(n_layers):
871
+ self.conv.append(ConvBlock(
872
+ n_chans, n_chans, kernel_size, stride=strides[idx], norm=norm, dropout=dropout))
873
+ self.out_proj = Linear(n_chans, odim)
874
+
875
+ def forward(self, x, return_hiddens=False):
876
+ """
877
+
878
+ :param x: [B, T, H]
879
+ :return: [B, T, H]
880
+ """
881
+ x = self.in_proj(x)
882
+ x = x.transpose(1, -1) # (B, idim, Tmax)
883
+ hiddens = []
884
+ for f in self.conv:
885
+ x_ = f(x)
886
+ x = x + x_ if self.res else x_ # (B, C, Tmax)
887
+ hiddens.append(x)
888
+ x = x.transpose(1, -1)
889
+ x = self.out_proj(x) # (B, Tmax, H)
890
+ if return_hiddens:
891
+ hiddens = torch.stack(hiddens, 1) # [B, L, C, T]
892
+ return x, hiddens
893
+ return x
894
+
895
+
896
+ class ConvGlobalStacks(nn.Module):
897
+ def __init__(self, idim=80, n_layers=5, n_chans=256, odim=32, kernel_size=5, norm='gn', dropout=0,
898
+ strides=[2, 2, 2, 2, 2]):
899
+ super().__init__()
900
+ self.conv = torch.nn.ModuleList()
901
+ self.pooling = torch.nn.ModuleList()
902
+ self.kernel_size = kernel_size
903
+ self.in_proj = Linear(idim, n_chans)
904
+ for idx in range(n_layers):
905
+ self.conv.append(ConvBlock(n_chans, n_chans, kernel_size, stride=strides[idx],
906
+ norm=norm, dropout=dropout))
907
+ self.pooling.append(nn.MaxPool1d(strides[idx]))
908
+ self.out_proj = Linear(n_chans, odim)
909
+
910
+ def forward(self, x):
911
+ """
912
+
913
+ :param x: [B, T, H]
914
+ :return: [B, T, H]
915
+ """
916
+ x = self.in_proj(x)
917
+ x = x.transpose(1, -1) # (B, idim, Tmax)
918
+ for f, p in zip(self.conv, self.pooling):
919
+ x = f(x) # (B, C, T)
920
+ x = x.transpose(1, -1)
921
+ x = self.out_proj(x.mean(1)) # (B, H)
922
+ return x
923
+
924
+
925
+ class ConvDecoder(nn.Module):
926
+ def __init__(self, c, dropout, kernel_size=9, act='gelu'):
927
+ super().__init__()
928
+ self.c = c
929
+ self.dropout = dropout
930
+
931
+ self.pre_convs = nn.ModuleList()
932
+ self.pre_lns = nn.ModuleList()
933
+ for i in range(2):
934
+ self.pre_convs.append(TransformerFFNLayer(
935
+ c, c * 2, padding='LEFT', kernel_size=kernel_size, dropout=dropout, act=act))
936
+ self.pre_lns.append(LayerNorm(c))
937
+
938
+ self.layer_norm_attn = LayerNorm(c)
939
+ self.encoder_attn = MultiheadAttention(c, 1, encoder_decoder_attention=True, bias=False)
940
+
941
+ self.post_convs = nn.ModuleList()
942
+ self.post_lns = nn.ModuleList()
943
+ for i in range(8):
944
+ self.post_convs.append(TransformerFFNLayer(
945
+ c, c * 2, padding='LEFT', kernel_size=kernel_size, dropout=dropout, act=act))
946
+ self.post_lns.append(LayerNorm(c))
947
+
948
+ def forward(
949
+ self,
950
+ x,
951
+ encoder_out=None,
952
+ encoder_padding_mask=None,
953
+ incremental_state=None,
954
+ **kwargs,
955
+ ):
956
+ attn_logits = None
957
+ for conv, ln in zip(self.pre_convs, self.pre_lns):
958
+ residual = x
959
+ x = ln(x)
960
+ x = conv(x) + residual
961
+ if encoder_out is not None:
962
+ residual = x
963
+ x = self.layer_norm_attn(x)
964
+ x, attn = self.encoder_attn(
965
+ query=x,
966
+ key=encoder_out,
967
+ value=encoder_out,
968
+ key_padding_mask=encoder_padding_mask,
969
+ incremental_state=incremental_state,
970
+ static_kv=True,
971
+ enc_dec_attn_constraint_mask=get_incremental_state(self, incremental_state,
972
+ 'enc_dec_attn_constraint_mask'),
973
+ )
974
+ attn_logits = attn[1]
975
+ x = F.dropout(x, self.dropout, training=self.training)
976
+ x = residual + x
977
+ for conv, ln in zip(self.post_convs, self.post_lns):
978
+ residual = x
979
+ x = ln(x)
980
+ x = conv(x) + residual
981
+ return x, attn_logits
982
+
983
+ def clear_buffer(self, input, encoder_out=None, encoder_padding_mask=None, incremental_state=None):
984
+ self.encoder_attn.clear_buffer(incremental_state)
985
+ self.ffn.clear_buffer(incremental_state)
986
+
987
+ def set_buffer(self, name, tensor, incremental_state):
988
+ return set_incremental_state(self, incremental_state, name, tensor)
modules/audio2motion/transformer_models.py ADDED
@@ -0,0 +1,208 @@
+ import torch
3
+ import torch.nn as nn
4
+ from modules.audio2motion.transformer_base import *
5
+
6
+ DEFAULT_MAX_SOURCE_POSITIONS = 2000
7
+ DEFAULT_MAX_TARGET_POSITIONS = 2000
8
+
9
+
10
+ class TransformerEncoderLayer(nn.Module):
11
+ def __init__(self, hidden_size, dropout, kernel_size=None, num_heads=2, norm='ln'):
12
+ super().__init__()
13
+ self.hidden_size = hidden_size
14
+ self.dropout = dropout
15
+ self.num_heads = num_heads
16
+ self.op = EncSALayer(
17
+ hidden_size, num_heads, dropout=dropout,
18
+ attention_dropout=0.0, relu_dropout=dropout,
19
+ kernel_size=kernel_size
20
+ if kernel_size is not None else 9,
21
+ padding='SAME',
22
+ norm=norm, act='gelu'
23
+ )
24
+
25
+ def forward(self, x, **kwargs):
26
+ return self.op(x, **kwargs)
27
+
28
+
29
+ ######################
30
+ # fastspeech modules
31
+ ######################
32
+ class LayerNorm(torch.nn.LayerNorm):
33
+ """Layer normalization module.
34
+ :param int nout: output dim size
35
+ :param int dim: dimension to be normalized
36
+ """
37
+
38
+ def __init__(self, nout, dim=-1, eps=1e-5):
39
+ """Construct an LayerNorm object."""
40
+ super(LayerNorm, self).__init__(nout, eps=eps)
41
+ self.dim = dim
42
+
43
+ def forward(self, x):
44
+ """Apply layer normalization.
45
+ :param torch.Tensor x: input tensor
46
+ :return: layer normalized tensor
47
+ :rtype torch.Tensor
48
+ """
49
+ if self.dim == -1:
50
+ return super(LayerNorm, self).forward(x)
51
+ return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1)
52
+
53
+
54
+ class FFTBlocks(nn.Module):
55
+ def __init__(self, hidden_size, num_layers, ffn_kernel_size=9, dropout=None,
56
+ num_heads=2, use_pos_embed=True, use_last_norm=True, norm='ln',
57
+ use_pos_embed_alpha=True):
58
+ super().__init__()
59
+ self.num_layers = num_layers
60
+ embed_dim = self.hidden_size = hidden_size
61
+ self.dropout = dropout if dropout is not None else 0.1
62
+ self.use_pos_embed = use_pos_embed
63
+ self.use_last_norm = use_last_norm
64
+ if use_pos_embed:
65
+ self.max_source_positions = DEFAULT_MAX_TARGET_POSITIONS
66
+ self.padding_idx = 0
67
+ self.pos_embed_alpha = nn.Parameter(torch.Tensor([1])) if use_pos_embed_alpha else 1
68
+ self.embed_positions = SinusoidalPositionalEmbedding(
69
+ embed_dim, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS,
70
+ )
71
+
72
+ self.layers = nn.ModuleList([])
73
+ self.layers.extend([
74
+ TransformerEncoderLayer(self.hidden_size, self.dropout,
75
+ kernel_size=ffn_kernel_size, num_heads=num_heads,
76
+ norm=norm)
77
+ for _ in range(self.num_layers)
78
+ ])
79
+ if self.use_last_norm:
80
+ if norm == 'ln':
81
+ self.layer_norm = nn.LayerNorm(embed_dim)
82
+ elif norm == 'bn':
83
+ self.layer_norm = BatchNorm1dTBC(embed_dim)
84
+ elif norm == 'gn':
85
+ self.layer_norm = GroupNorm1DTBC(8, embed_dim)
86
+ else:
87
+ self.layer_norm = None
88
+
89
+ def forward(self, x, padding_mask=None, attn_mask=None, return_hiddens=False):
90
+ """
91
+ :param x: [B, T, C]
92
+ :param padding_mask: [B, T]
93
+ :return: [B, T, C] or [L, B, T, C]
94
+ """
95
+ padding_mask = x.abs().sum(-1).eq(0).data if padding_mask is None else padding_mask
96
+ nonpadding_mask_TB = 1 - padding_mask.transpose(0, 1).float()[:, :, None] # [T, B, 1]
97
+ if self.use_pos_embed:
98
+ positions = self.pos_embed_alpha * self.embed_positions(x[..., 0])
99
+ x = x + positions
100
+ x = F.dropout(x, p=self.dropout, training=self.training)
101
+ # B x T x C -> T x B x C
102
+ x = x.transpose(0, 1) * nonpadding_mask_TB
103
+ hiddens = []
104
+ for layer in self.layers:
105
+ x = layer(x, encoder_padding_mask=padding_mask, attn_mask=attn_mask) * nonpadding_mask_TB
106
+ hiddens.append(x)
107
+ if self.use_last_norm:
108
+ x = self.layer_norm(x) * nonpadding_mask_TB
109
+ if return_hiddens:
110
+ x = torch.stack(hiddens, 0) # [L, T, B, C]
111
+ x = x.transpose(1, 2) # [L, B, T, C]
112
+ else:
113
+ x = x.transpose(0, 1) # [B, T, C]
114
+ return x
115
+
116
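A minimal sketch of running the stack on already-projected features (192 matches backbone1 below; in the padding mask, 1 marks padded frames):

blocks = FFTBlocks(hidden_size=192, num_layers=3)
feats = torch.randn(2, 100, 192)             # [B, T, C]
pad_mask = torch.zeros(2, 100)               # [B, T], 1 = padded frame
out = blocks(feats, padding_mask=pad_mask)   # [B, T, C]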
+ class SequentialSA(nn.Module):
117
+ def __init__(self,layers):
118
+ super(SequentialSA,self).__init__()
119
+ self.layers = nn.ModuleList(layers)
120
+
121
+ def forward(self,x,x_mask):
122
+ """
123
+ x: [batch, T, H]
124
+ x_mask: [batch, T]
125
+ """
126
+ pad_mask = 1. - x_mask
127
+ for layer in self.layers:
128
+ if isinstance(layer, EncSALayer):
129
+ x = x.permute(1,0,2)
130
+ x = layer(x,pad_mask)
131
+ x = x.permute(1,0,2)
132
+ elif isinstance(layer, nn.Linear):
133
+ x = layer(x) * x_mask.unsqueeze(2)
134
+ elif isinstance(layer, nn.AvgPool1d):
135
+ x = x.permute(0,2,1)
136
+ x = layer(x)
137
+ x = x.permute(0,2,1)
138
+ elif isinstance(layer, nn.PReLU):
139
+ bs, t, hid = x.shape
140
+ x = x.reshape([bs*t,hid])
141
+ x = layer(x)
142
+ x = x.reshape([bs, t, hid])
143
+ else: # Relu
144
+ x = layer(x)
145
+
146
+ return x
147
+
148
+ class TransformerStyleFusionModel(nn.Module):
149
+ def __init__(self, num_heads=4, dropout = 0.1, out_dim = 64):
150
+ super(TransformerStyleFusionModel, self).__init__()
151
+ self.audio_layer = SequentialSA([
152
+ nn.Linear(29, 48),
153
+ nn.ReLU(48),
154
+ nn.Linear(48, 128),
155
+ ])
156
+
157
+ self.energy_layer = SequentialSA([
158
+ nn.Linear(1, 16),
159
+ nn.ReLU(16),
160
+ nn.Linear(16, 64),
161
+ ])
162
+
163
+ self.backbone1 = FFTBlocks(hidden_size=192,num_layers=3)
164
+
165
+ self.sty_encoder = nn.Sequential(*[
166
+ nn.Linear(135, 64),
167
+ nn.ReLU(),
168
+ nn.Linear(64, 128)
169
+ ])
170
+
171
+ self.backbone2 = FFTBlocks(hidden_size=320,num_layers=3)
172
+
173
+ self.out_layer = SequentialSA([
174
+ nn.AvgPool1d(kernel_size=2,stride=2,padding=0), #[b,hid,t_audio]=>[b,hid,t_audio//2]
175
+ nn.Linear(320,out_dim),
176
+ nn.PReLU(out_dim),
177
+ nn.Linear(out_dim,out_dim),
178
+ ])
179
+
180
+ self.dropout = nn.Dropout(p = dropout)
181
+
182
+ def forward(self, audio, energy, style, x_mask, y_mask):
183
+ pad_mask = 1. - x_mask
184
+ audio_feat = self.audio_layer(audio, x_mask)
185
+ energy_feat = self.energy_layer(energy, x_mask)
186
+ feat = torch.cat((audio_feat, energy_feat), dim=-1) # [batch, T, H=128+64=192]
187
+ feat = self.backbone1(feat, pad_mask)
188
+ feat = self.dropout(feat)
189
+
190
+ sty_feat = self.sty_encoder(style) # [batch, 135] => [batch, H=128]
+ sty_feat = sty_feat.unsqueeze(1).repeat(1, feat.shape[1], 1) # [batch, T, H=128]
192
+
193
+ feat = torch.cat([feat, sty_feat], dim=-1) # [batch, T, H=192+128=320]
+ feat = self.backbone2(feat, pad_mask) # [batch, T, H=320]
195
+ out = self.out_layer(feat, y_mask) # [batch, T//2, H=out_dim]
196
+
197
+ return out
198
+
199
+
200
+ if __name__ == '__main__':
+ model = TransformerStyleFusionModel()
+ audio = torch.rand(4, 200, 29) # [B, T, H]
+ energy = torch.rand(4, 200, 1) # [B, T, 1]
+ style = torch.ones(4, 135) # [B, H]
+ x_mask = torch.ones(4, 200) # [B, T]
+ x_mask[3, 10:] = 0
+ y_mask = torch.ones(4, 100) # [B, T//2]; out_layer pools the time axis by 2
+ ret = model(audio, energy, style, x_mask, y_mask)
+ print(ret.shape)
modules/audio2motion/utils.py ADDED
@@ -0,0 +1,29 @@
+ import torch
2
+
3
+
4
+ def squeeze(x, x_mask=None, n_sqz=2):
5
+ b, c, t = x.size()
6
+
7
+ t = (t // n_sqz) * n_sqz
8
+ x = x[:, :, :t]
9
+ x_sqz = x.view(b, c, t // n_sqz, n_sqz)
10
+ x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz)
11
+
12
+ if x_mask is not None:
13
+ x_mask = x_mask[:, :, n_sqz - 1::n_sqz]
14
+ else:
15
+ x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype)
16
+ return x_sqz * x_mask, x_mask
17
+
18
+
19
+ def unsqueeze(x, x_mask=None, n_sqz=2):
20
+ b, c, t = x.size()
21
+
22
+ x_unsqz = x.view(b, n_sqz, c // n_sqz, t)
23
+ x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz)
24
+
25
+ if x_mask is not None:
26
+ x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz)
27
+ else:
28
+ x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype)
29
+ return x_unsqz * x_mask, x_mask
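A round-trip sketch of the two helpers (shapes assumed): squeeze trades time resolution for channels, and unsqueeze inverts it exactly when T is divisible by n_sqz.

import torch

x = torch.randn(2, 64, 100)                  # [B, C, T]
x_sqz, m_sqz = squeeze(x, n_sqz=2)           # [2, 128, 50], mask [2, 1, 50]
x_rec, m_rec = unsqueeze(x_sqz, n_sqz=2)     # [2, 64, 100]
assert torch.allclose(x_rec, x)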