# character-360 / inference.yaml
# Provenance: aki-0421 (commit a3a3ae4, unverified), 5.05 kB — HuggingFace file-viewer
# header preserved here as comments so the document remains valid YAML.
---
# Inference configuration for vtdm VideoLDM (sgm / Stable-Video-Diffusion-style config).
# NOTE(review): indentation was lost in the pasted source; nesting below is
# reconstructed from the standard sgm config schema
# (model.params.{denoiser,network,conditioner,first_stage,loss_fn,sampler}_config)
# — confirm against the upstream vtdm repository.
model:
  target: vtdm.vtdm_gen_v01.VideoLDM
  base_learning_rate: 1.0e-05
  params:
    input_key: video
    scale_factor: 0.18215
    log_keys: caption
    num_samples: 25  # frame_rate
    # Only parameters whose names match these substrings are trainable.
    trained_param_keys:
      - diffusion_model.label_emb.0.0.weight
      - .emb_layers.
      - .time_stack.
    en_and_decode_n_samples_a_time: 25  # frame_rate
    disable_first_stage_autocast: true

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    # Temporal UNet backbone.
    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: true
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions:
          - 4
          - 2
          - 1
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 4
          - 4
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: true
        use_spatial_context: true
        merge_strategy: learned_with_images
        video_kernel_size:
          - 3
          - 1
          - 1

    # Conditioning embedders (CLIP image, aesthetic, elevation, latent frames, noise aug).
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: false
            input_key: cond_frames_without_noise
            ucg_rate: 0.1
            target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
            params:
              n_cond_frames: 1
              n_copies: 1
              open_clip_embedding_config:
                target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
                params:
                  version: ckpts/open_clip_pytorch_model.bin
                  freeze: true
          - is_trainable: false
            input_key: video
            ucg_rate: 0.0
            target: vtdm.encoders.AesEmbedder
          - is_trainable: false
            input_key: elevation
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - input_key: cond_frames
            is_trainable: false
            ucg_rate: 0.1
            target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
            params:
              disable_encoder_autocast: true
              n_cond_frames: 1
              n_copies: 25  # frame_rate
              is_ae: true
              encoder_config:
                target: sgm.models.autoencoder.AutoencoderKLModeOnly
                params:
                  embed_dim: 4
                  monitor: val/rec_loss
                  ddconfig:
                    attn_type: vanilla-xformers
                    double_z: true
                    z_channels: 4
                    resolution: 256
                    in_channels: 3
                    out_ch: 3
                    ch: 128
                    ch_mult:
                      - 1
                      - 2
                      - 4
                      - 4
                    num_res_blocks: 2
                    attn_resolutions: []
                    dropout: 0.0
                  lossconfig:
                    target: torch.nn.Identity
          - input_key: cond_aug
            is_trainable: false
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    # VAE used to encode/decode latents at inference time.
    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        num_frames: 25  # frame_rate
        batch2model_keys:
          - num_video_frames
          - image_only_indicator
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
          params:
            p_mean: 1.0
            p_std: 1.6
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.VWeighting

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.LinearMultistepSampler
      params:
        num_steps: 50
        # was "True"; canonical lowercase boolean per YAML 1.2 / yamllint truthy
        verbose: true
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0
        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            num_frames: 25  # frame_rate
            max_scale: 2.5
            min_scale: 1.0