# Application identifier. Presumably selects which training app the launcher
# runs; confirm against the launcher code (not part of this file).
app: vjepa
# Data-loading settings. Key semantics are defined by the consuming code, which
# is not part of this file; notes marked "presumably" are inferred from key
# names and should be confirmed there.
data:
  batch_size: 8
  # Explicitly unset (null).
  clip_duration: null
  crop_size: 224
  dataset_type: VideoDataset
  # Placeholder path: replace with the real dataset index file(s) before use.
  datasets:
  - /path/to/dataset.csv
  decode_one_clip: true
  filter_short_videos: false
  num_clips: 1
  # NOTE(review): presumably 16 frames / tubelet_size 2 = 8 temporal tokens
  # per clip; confirm against the model code.
  num_frames: 16
  num_workers: 4
  # NOTE(review): presumably crop_size 224 / patch_size 16 = 14 patches per
  # spatial side; confirm against the model code.
  patch_size: 16
  pin_mem: true
  sampling_rate: 4
  tubelet_size: 2
# Data-augmentation settings. The two boolean switches are off and reprob is
# 0.0. Each two-element list is presumably a (min, max) range; confirm against
# the consuming code (not part of this file).
data_aug:
  auto_augment: false
  motion_shift: false
  random_resize_aspect_ratio: [0.75, 1.35]
  random_resize_scale: [0.3, 1.0]
  reprob: 0.0
# Logging output settings.
logging:
  # Placeholder path: replace with a real directory before use.
  folder: /path/to/logs
  # NOTE(review): presumably a tag used to name log/checkpoint files; confirm
  # against the consuming code.
  write_tag: jepa
# Loss settings. Semantics are defined by the consuming code (not part of this
# file).
loss:
  # NOTE(review): presumably the exponent applied in the loss; confirm.
  loss_exp: 1.0
  # A coefficient of 0.0; presumably this disables the regularisation term.
  # Confirm against the loss implementation.
  reg_coeff: 0.0
# Two mask configurations. They are identical except for num_blocks (8 vs 2)
# and spatial_scale (0.15 vs 0.7). Each two-element list is presumably a
# (min, max) range; confirm against the consuming code (not part of this file).
mask:
# Entry 1: eight blocks, spatial_scale fixed at 0.15.
- aspect_ratio: [0.75, 1.5]
  max_keep: null
  max_temporal_keep: 1.0
  num_blocks: 8
  spatial_scale: [0.15, 0.15]
  temporal_scale: [1.0, 1.0]
# Entry 2: two blocks, spatial_scale fixed at 0.7.
- aspect_ratio: [0.75, 1.5]
  max_keep: null
  max_temporal_keep: 1.0
  num_blocks: 2
  spatial_scale: [0.7, 0.7]
  temporal_scale: [1.0, 1.0]
# Run-level settings: numeric dtype, checkpointing, evaluation cadence, seed.
# Units of the *_freq keys are not shown in this file; confirm against the
# consuming code.
meta:
  dtype: bfloat16
  eval_freq: 100
  # load_checkpoint is enabled and read_checkpoint below is a placeholder path:
  # replace it with a real checkpoint file before use.
  load_checkpoint: true
  read_checkpoint: /path/to/vitl16.pth.tar
  save_every_freq: 5
  seed: 234
  # NOTE(review): presumably toggles a scaled-dot-product-attention code path;
  # confirm against the model code.
  use_sdpa: true
# Model settings. Semantics are defined by the consuming code (not part of this
# file).
model:
  model_name: vit_large
  # NOTE(review): the pred_* keys presumably configure a predictor network
  # (depth 12, embedding dim 384); confirm against the model code.
  pred_depth: 12
  pred_embed_dim: 384
  uniform_power: true
  use_mask_tokens: true
  zero_init_mask_tokens: true
# Node count for the job. NOTE(review): presumably combined with
# tasks_per_node by the launcher; confirm against the launcher code.
nodes: 16
# Optimiser and schedule settings. Semantics and units are defined by the
# consuming code (not part of this file); notes marked "presumably" are
# inferred from key names.
optimization:
  clip_grad: 10.0
  # Presumably (start, end) values for an EMA momentum schedule; confirm.
  ema: [0.998, 1.0]
  # NOTE(review): warmup below (40) is larger than epochs (25). If warmup is
  # measured in epochs, the warm-up would never finish within this run and lr
  # would never reach its configured value; confirm the unit and intent.
  epochs: 25
  final_lr: 1.0e-06
  # 0.4 is ten times weight_decay (0.04) below.
  final_weight_decay: 0.4
  # NOTE(review): presumably "iterations per epoch"; confirm.
  ipe: 300
  ipe_scale: 1.25
  lr: 0.000625
  start_lr: 0.0002
  warmup: 40
  weight_decay: 0.04
# Tasks launched on each node. NOTE(review): with 16 nodes configured this
# would give 16 x 8 = 128 tasks in total, if the launcher uses both values
# that way; confirm against the launcher code.
tasks_per_node: 8