# Model section. `type` names the architecture entry; the remaining keys are
# presumably handed to it by the config loader -- TODO confirm.
model:
  type: PixArtVideo_XL_1x2x2
  space_scale: 0.5
  time_scale: 1.0
  mlp_type: "llama"

  # NOTE(review): "spaltial" looks like a misspelling of "spatial"; the key is
  # left untouched because the consumer presumably reads it by this exact name.
  position_embed_spaltial: "absolute"
  position_embed_temporal: "rope"

  norm_type: "llamarmsnorm"
  in_channels: 8
  temp_window_size: [-1, 8, 8]
  adain_with_text: true
  qk_norm: false

  prob_text_condition: 1.0
  prob_img_condition: 0
  prob_img_condition_attn: 0

  class_dropout_prob: 0.1

  grad_checkpointing: true

  enable_frames_embedder: false
  enable_tgt_size_embedder: false

# NOTE(review): indentation was lost in this file, so it is unclear whether
# this key belongs at the top level or inside `model:` -- confirm against the
# code that reads it.
clip_image_encoder: "pretrain_models/openai/clip-vit-large-patch14"

# VAE section: type name, its config file, and the weights file to load.
vae:
  # NOTE(review): "CausualVAEVideo" (sic) is kept verbatim; it is presumably a
  # registered name that the code matches exactly.
  type: "CausualVAEVideo"

  config: "configs/vae_config.yaml"
  # NOTE(review): this path uses "pretrain_model/" (singular) while the other
  # paths in this file use "pretrain_models/" -- confirm the directory name.
  from_pretrained: "./pretrain_model/vidgen/vae/vae_pytorch_model.bin"


# Text encoder section.
text_encoder:
  type: "t5"
  # Only a directory is given here; no checkpoint name is specified.
  from_pretrained: "pretrain_models/"
  model_max_length: 200
  shardformer: true


# Diffusion section.
diffusion:
  type: "IDDPM"
  snr: false
  train_sampling_steps: 1000
  prob_self_condition: 0
  v_predict: false


# Optimizer section.
optimizer:
  # Exponent values are written with an explicit mantissa dot: YAML 1.1
  # loaders such as PyYAML resolve a bare `1e-4` as a string, not a float.
  learning_rate: 1.0e-4
  weight_decay: 0
  eps: 1.0e-8
  min_lr_ratio: 0.95
  gradient_clip: 1.0

# Top-level run settings (not nested under any section).
num_frames_video: 17
num_slice_for_long_video: -1

resolution_video: -1
resolution_image: -1
mode_various_resolution: false

precision: "bf16"
seed: 42
workers: 4
# NOTE(review): `grad_checkpointing` in the model section is enabled while this
# flag is disabled -- confirm which of the two the trainer actually reads.
grad_checkpoint: false
gradient_accumulation_steps: 4
logging_steps: 10