# Training configuration.
# Block-style YAML, 2-space indentation, one key per line.

MODEL:
  WEIGHTS: ''

# Per-module sharding strategy and mixed-precision dtypes, listed separately
# for the teacher and student networks.
# NOTE(review): key names match PyTorch FSDP's ShardingStrategy and
# MixedPrecision fields -- presumably consumed by the FSDP wrapper; confirm
# against the loader.
compute_precision:
  grad_scaler: true
  teacher:
    backbone:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    dino_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    ibot_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
  student:
    backbone:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    # The two student heads use reduce_dtype fp32, unlike every other module
    # in this section (fp16). Keep this difference when editing.
    dino_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp32
        buffer_dtype: fp32
    ibot_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp32
        buffer_dtype: fp32

# DINO loss weight and head dimensions.
dino:
  loss_weight: 1.0
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_nlayers: 3
  head_hidden_dim: 2048
  koleo_loss_weight: 0.1

# iBOT loss weight, masking parameters and head dimensions.
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  # Two-element list; by the key name, [min, max] of the mask ratio.
  mask_ratio_min_max:
    - 0.1
    - 0.5
  separate_head: false
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_nlayers: 3
  head_hidden_dim: 2048

# Data loading, checkpointing and run-level settings.
train:
  batch_size_per_gpu: 64
  dataset_path: ImageNet:split=TRAIN
  output_dir: .
  saveckp_freq: 20
  seed: 0
  num_workers: 10
  OFFICIAL_EPOCH_LENGTH: 1250
  cache_dataset: true
  centering: "centering"  # or "sinkhorn_knopp"

# Student network architecture options.
student:
  arch: vit_large
  patch_size: 16
  drop_path_rate: 0.3
  layerscale: 1.0e-05
  drop_path_uniform: true
  pretrained_weights: ''
  ffn_layer: "mlp"
  block_chunks: 0
  qkv_bias: true
  proj_bias: true
  ffn_bias: true

# Teacher momentum and temperature schedule parameters.
teacher:
  momentum_teacher: 0.992
  final_momentum_teacher: 1
  warmup_teacher_temp: 0.04
  teacher_temp: 0.07
  warmup_teacher_temp_epochs: 30

# Optimizer and learning-rate schedule parameters.
optim:
  epochs: 100
  weight_decay: 0.04
  weight_decay_end: 0.4
  base_lr: 0.004  # learning rate for a batch size of 1024
  lr: 0.  # will be set after applying scaling rule
  warmup_epochs: 10
  min_lr: 1.0e-06
  clip_grad: 3.0
  freeze_last_layer_epochs: 1
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  layerwise_decay: 0.9
  adamw_beta1: 0.9
  adamw_beta2: 0.999

# Global / local crop settings. Each *_scale entry is a two-element list.
crops:
  global_crops_scale:
    - 0.32
    - 1.0
  local_crops_number: 8
  local_crops_scale:
    - 0.05
    - 0.32
  global_crops_size: 224
  local_crops_size: 96

evaluation:
  eval_period_iterations: 12500