model:
  type: PixArtVideo_XL_1x2x2
  space_scale: 0.5
  time_scale: 1.0
  mlp_type: "llama"
  #enable_rope: True
  position_embed_spaltial: "absolute"
  position_embed_temporal: "rope"
  norm_type: "llamarmsnorm"
  in_channels: 8  # to be consistent with the video VAE (z=8)
  temp_window_size: [-1, 8, 8]  # window attention for the temporal attention
  adain_with_text: True
  qk_norm: False
  prob_text_condition: 1.0
  prob_img_condition: 0
  prob_img_condition_attn: 0
  class_dropout_prob: 0.1
  grad_checkpointing: True
  enable_frames_embedder: False
  enable_tgt_size_embedder: False
  clip_image_encoder: "pretrain_models/openai/clip-vit-large-patch14"

vae:
  type: "CausualVAEVideo"  # z=8
  config: "configs/vae_config.yaml"
  from_pretrained: "./pretrain_model/vidgen/vae/vae_pytorch_model.bin"

text_encoder:
  type: "t5"
  from_pretrained: "pretrain_models/"
  model_max_length: 200
  shardformer: True

diffusion:
  type: "IDDPM"
  snr: False
  train_sampling_steps: 1000
  prob_self_condition: 0
  v_predict: False

optimizer:
  learning_rate: 1e-4
  weight_decay: 0
  eps: 1e-8
  min_lr_ratio: 0.95
  gradient_clip: 1.0

num_frames_video: 17  # base number of frames per video slice
num_slice_for_long_video: -1  # number of 2 s slices a long video is split into; -1 means dynamic
resolution_video: -1
resolution_image: -1
mode_various_resolution: False
precision: "bf16"
seed: 42
workers: 4
grad_checkpoint: False
gradient_accumulation_steps: 4
logging_steps: 10
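
# ------------------------------------------------------------------
# Usage sketch (an assumption, not part of this repo's code): one way a
# config like this is typically consumed, using plain PyYAML and a
# hypothetical path "configs/train_video.yaml"; the project's actual
# loader may differ (e.g. OmegaConf). Kept as comments so this file
# remains valid YAML.
#
#   import yaml
#
#   with open("configs/train_video.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   # Top-level keys map directly to the sections above.
#   model_cfg = cfg["model"]   # e.g. model_cfg["in_channels"] == 8
#   vae_cfg = cfg["vae"]       # VAE latent channels (z=8) must match in_channels
#
#   # PyYAML quirk: "1e-4" has no decimal point, so YAML 1.1 rules load
#   # it as a string; cast explicitly before handing it to an optimizer.
#   lr = float(cfg["optimizer"]["learning_rate"])
# ------------------------------------------------------------------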