# Shared values that other sections can pull in via ${globals.<key>} interpolation.
globals:
    # NOTE(review): presumably the frame rate of the target videos; it is not
    # referenced by the sections visible here -- confirm where it is consumed.
    target_fps: 32
    # Frame count; unet.num_frames interpolates this value.
    target_nframes: 64

# Constructor arguments for the spatio-temporal denoising UNet.
unet:
    # NOTE(review): presumably resolved to a model class by the loading code -- confirm.
    _class_name: UNetSpatioTemporalConditionModel

    # Input / output geometry.
    sample_size: 14
    in_channels: 8
    out_channels: 4
    # Interpolated from globals.target_nframes (64 in this file).
    num_frames: ${globals.target_nframes}

    # Per-level layout: every list below carries one entry per level (four levels).
    block_out_channels: [128, 256, 256, 512]
    layers_per_block: 2
    down_block_types:
        - CrossAttnDownBlockSpatioTemporal
        - CrossAttnDownBlockSpatioTemporal
        - CrossAttnDownBlockSpatioTemporal
        - DownBlockSpatioTemporal
    # Mirror image of down_block_types: the non-CrossAttn block comes first here.
    up_block_types:
        - UpBlockSpatioTemporal
        - CrossAttnUpBlockSpatioTemporal
        - CrossAttnUpBlockSpatioTemporal
        - CrossAttnUpBlockSpatioTemporal

    # Attention settings.
    num_attention_heads: [8, 16, 16, 32]
    transformer_layers_per_block: 1
    cross_attention_dim: 1

    # Conditioning-embedding sizes.
    addition_time_embed_dim: 1
    projection_class_embeddings_input_dim: 1

# Settings for the DDPM noise scheduler.
noise_scheduler:
    _class_name: DDPMScheduler
    prediction_type: v_prediction  # one of: epsilon, sample, v_prediction

    # Beta schedule.
    num_train_timesteps: 1000
    beta_schedule: linear  # one of: linear, scaled_linear, squaredcos_cap_v2
    beta_start: 0.0001
    beta_end: 0.02
    # one of: fixed_small, fixed_small_log, fixed_large, fixed_large_log,
    # learned, learned_range
    variance_type: fixed_small

    # Sample clipping.
    clip_sample: true
    clip_sample_range: 4.0  # default 1

    # Thresholding stays disabled -- do not touch. The two values below are unused.
    thresholding: false
    dynamic_thresholding_ratio: 0.995
    sample_max_value: 1.0

    # Timestep spacing.
    timestep_spacing: leading
    steps_offset: 0  # unused