---
# Run configuration for the `vjepa` app.
# NOTE(review): this file was restored from a single-line form in which all
# indentation was lost. Nesting was inferred from the sorted key order and
# the sequence markers; confirm it against the schema the consumer expects.

app: vjepa

data:
  batch_size: 8
  clip_duration: null
  crop_size: 224
  dataset_type: VideoDataset
  # Placeholder path; point this at a real dataset index before running.
  datasets:
    - /path/to/dataset.csv
  decode_one_clip: true
  filter_short_videos: false
  num_clips: 1
  num_frames: 16
  num_workers: 4
  patch_size: 16
  pin_mem: true
  sampling_rate: 4
  tubelet_size: 2

data_aug:
  auto_augment: false
  motion_shift: false
  # Two-element ranges, presumably [min, max] - TODO confirm.
  random_resize_aspect_ratio: [0.75, 1.35]
  random_resize_scale: [0.3, 1.0]
  reprob: 0.0

logging:
  # Placeholder path; replace before running.
  folder: /path/to/logs
  write_tag: jepa

loss:
  loss_exp: 1.0
  reg_coeff: 0.0

# Two mask entries that differ only in num_blocks and spatial_scale:
# 8 blocks at spatial scale 0.15, and 2 blocks at spatial scale 0.7.
mask:
  - aspect_ratio: [0.75, 1.5]
    max_keep: null
    max_temporal_keep: 1.0
    num_blocks: 8
    spatial_scale: [0.15, 0.15]
    temporal_scale: [1.0, 1.0]
  - aspect_ratio: [0.75, 1.5]
    max_keep: null
    max_temporal_keep: 1.0
    num_blocks: 2
    spatial_scale: [0.7, 0.7]
    temporal_scale: [1.0, 1.0]

meta:
  dtype: bfloat16
  eval_freq: 100
  load_checkpoint: true
  # Placeholder path; replace with the checkpoint to read.
  read_checkpoint: /path/to/vitl16.pth.tar
  save_every_freq: 5
  seed: 234
  use_sdpa: true

model:
  model_name: vit_large
  pred_depth: 12
  pred_embed_dim: 384
  uniform_power: true
  use_mask_tokens: true
  zero_init_mask_tokens: true

nodes: 16

optimization:
  clip_grad: 10.0
  ema: [0.998, 1.0]
  epochs: 25
  final_lr: 1.0e-06
  final_weight_decay: 0.4
  ipe: 300
  ipe_scale: 1.25
  lr: 0.000625
  start_lr: 0.0002
  # NOTE(review): warmup (40) is larger than epochs (25). If both are
  # measured in epochs the warmup phase would never finish - confirm the
  # units and whether this is intentional.
  warmup: 40
  weight_decay: 0.04

tasks_per_node: 8