# multimodalart's picture
# Upload folder using huggingface_hub
# 5a510e7 verified
# Default inputs. Both values are relative paths; what they are resolved
# against is decided by the consumer and is not visible in this file.
source_image: ./default.png
driving_audio: default.wav
# NOTE(review): presumably the numeric precision used for model weights —
# confirm which values the loader accepts.
weight_dtype: fp16
# Data-shape settings, grouped under `data`.
# The nesting is significant: `source_image` and `driving_audio` here are
# sub-sections of `data` and must not sit at the same level as the top-level
# `source_image` / `driving_audio` path keys, or they become duplicate keys
# and (with last-wins parsers) replace those paths with null.
# NOTE(review): section boundaries were reconstructed from the key names —
# confirm that `export_video` belongs under `data`.
data:
  n_motion_frames: 2
  n_sample_frames: 16
  source_image:
    width: 512
    height: 512
  driving_audio:
    sample_rate: 16000
  export_video:
    fps: 25
# Sampling settings.
# NOTE(review): presumably the number of denoising steps and the
# classifier-free guidance scale — confirm against the pipeline code.
inference_steps: 40
cfg_scale: 3.5
# Checkpoint locations; all are relative paths under ./pretrained_models.
audio_ckpt_dir: ./pretrained_models/hallo
base_model_path: ./pretrained_models/stable-diffusion-v1-5
motion_module_path: ./pretrained_models/motion_module/mm_sd_v15_v2.ckpt
# Auxiliary model locations, one section per component.
# Each section carries its own nested `model_path`; written at a single
# level the four `model_path` keys collide and only the last one survives.
face_analysis:
  model_path: ./pretrained_models/face_analysis

wav2vec:
  model_path: ./pretrained_models/wav2vec/wav2vec2-base-960h
  # NOTE(review): `features` is assumed to be a wav2vec option because it
  # directly follows that section's `model_path` — confirm.
  features: all

audio_separator:
  model_path: ./pretrained_models/audio_separator/Kim_Vocal_2.onnx

vae:
  model_path: ./pretrained_models/sd-vae-ft-mse
# Relative path used for saving; NOTE(review): the directory name suggests
# intermediate/cache files rather than final outputs — confirm.
save_path: ./.cache
# NOTE(review): presumably the factor by which the detected face region is
# enlarged — confirm against the face-processing code.
face_expand_ratio: 1.1
# Per-region weights; all three are set to the same value here.
# NOTE(review): how these weights are applied is not visible in this file.
pose_weight: 1.1
face_weight: 1.1
lip_weight: 1.1
# Extra options for the UNet, grouped under one mapping so they can be
# handed over as a unit instead of leaking into the top-level namespace.
# NOTE(review): nesting was reconstructed from the key names; in particular,
# `motion_module_kwargs` is taken to end at `temporal_attention_dim_div`,
# with `audio_attention_dim` and the `stack_enable_blocks_*` keys back at the
# `unet_additional_kwargs` level — confirm against the model constructor.
unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  use_audio_module: true
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    # The same block type is listed twice on purpose (two entries).
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  audio_attention_dim: 768
  stack_enable_blocks_name:
    - "up"
    - "down"
    - "mid"
  stack_enable_blocks_depth: [0, 1, 2, 3]
# Top-level switch. NOTE(review): presumably gates the "Zero-SNR params"
# group inside `noise_scheduler_kwargs` below — confirm in the pipeline code.
enable_zero_snr: true

# Options grouped for the noise scheduler.
# NOTE(review): nesting was reconstructed from the key names; the keys look
# like scheduler constructor arguments — confirm they are forwarded as such.
noise_scheduler_kwargs:
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  clip_sample: false
  steps_offset: 1
  # Zero-SNR params
  prediction_type: "v_prediction"
  rescale_betas_zero_snr: true
  timestep_spacing: "trailing"

# NOTE(review): assumed to be a top-level selector rather than a scheduler
# keyword argument — confirm.
sampler: DDIM