zejunyang
committed on
Commit
•
558ddd8
1
Parent(s):
8b870cd
init
Browse files
configs/inference/head_pose_temp/pose_temp.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed399e8a6dc5faab4eb98a676f211dbc108bddad5ec27164540cf25ad6d96818
|
3 |
+
size 8048
|
configs/inference/inference_audio.yaml
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
a2m_model:
|
2 |
+
out_dim: 1404
|
3 |
+
latent_dim: 512
|
4 |
+
model_path: ./pretrained_model/wav2vec2-base-960h
|
5 |
+
only_last_fetures: True
|
6 |
+
from_pretrained: True
|
7 |
+
|
8 |
+
a2p_model:
|
9 |
+
out_dim: 6
|
10 |
+
latent_dim: 512
|
11 |
+
model_path: ./pretrained_model/wav2vec2-base-960h
|
12 |
+
only_last_fetures: True
|
13 |
+
from_pretrained: True
|
14 |
+
|
15 |
+
pretrained_model:
|
16 |
+
a2m_ckpt: ./pretrained_model/audio2mesh.pt
|
17 |
+
a2p_ckpt: ./pretrained_model/audio2pose.pt
|
configs/inference/inference_v2.yaml
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
unet_additional_kwargs:
|
2 |
+
use_inflated_groupnorm: true
|
3 |
+
unet_use_cross_frame_attention: false
|
4 |
+
unet_use_temporal_attention: false
|
5 |
+
use_motion_module: true
|
6 |
+
motion_module_resolutions:
|
7 |
+
- 1
|
8 |
+
- 2
|
9 |
+
- 4
|
10 |
+
- 8
|
11 |
+
motion_module_mid_block: true
|
12 |
+
motion_module_decoder_only: false
|
13 |
+
motion_module_type: Vanilla
|
14 |
+
motion_module_kwargs:
|
15 |
+
num_attention_heads: 8
|
16 |
+
num_transformer_block: 1
|
17 |
+
attention_block_types:
|
18 |
+
- Temporal_Self
|
19 |
+
- Temporal_Self
|
20 |
+
temporal_position_encoding: true
|
21 |
+
temporal_position_encoding_max_len: 32
|
22 |
+
temporal_attention_dim_div: 1
|
23 |
+
|
24 |
+
noise_scheduler_kwargs:
|
25 |
+
beta_start: 0.00085
|
26 |
+
beta_end: 0.012
|
27 |
+
beta_schedule: "linear"
|
28 |
+
clip_sample: false
|
29 |
+
steps_offset: 1
|
30 |
+
### Zero-SNR params
|
31 |
+
prediction_type: "v_prediction"
|
32 |
+
rescale_betas_zero_snr: True
|
33 |
+
timestep_spacing: "trailing"
|
34 |
+
|
35 |
+
sampler: DDIM
|
configs/prompts/animation_audio.yaml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pretrained_base_model_path: './pretrained_model/stable-diffusion-v1-5'
|
2 |
+
pretrained_vae_path: './pretrained_model/sd-vae-ft-mse'
|
3 |
+
image_encoder_path: './pretrained_model/image_encoder'
|
4 |
+
|
5 |
+
denoising_unet_path: "./pretrained_model/denoising_unet.pth"
|
6 |
+
reference_unet_path: "./pretrained_model/reference_unet.pth"
|
7 |
+
pose_guider_path: "./pretrained_model/pose_guider.pth"
|
8 |
+
motion_module_path: "./pretrained_model/motion_module.pth"
|
9 |
+
|
10 |
+
audio_inference_config: "./configs/inference/inference_audio.yaml"
|
11 |
+
inference_config: "./configs/inference/inference_v2.yaml"
|
12 |
+
weight_dtype: 'fp16'
|
13 |
+
|
14 |
+
pose_temp: "./configs/inference/head_pose_temp/pose_temp.npy"
|
configs/prompts/animation_facereenac.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pretrained_base_model_path: './pretrained_model/stable-diffusion-v1-5'
|
2 |
+
pretrained_vae_path: './pretrained_model/sd-vae-ft-mse'
|
3 |
+
image_encoder_path: './pretrained_model/image_encoder'
|
4 |
+
|
5 |
+
denoising_unet_path: "./pretrained_model/denoising_unet.pth"
|
6 |
+
reference_unet_path: "./pretrained_model/reference_unet.pth"
|
7 |
+
pose_guider_path: "./pretrained_model/pose_guider.pth"
|
8 |
+
motion_module_path: "./pretrained_model/motion_module.pth"
|
9 |
+
|
10 |
+
inference_config: "./configs/inference/inference_v2.yaml"
|
11 |
+
weight_dtype: 'fp16'
|