#!/bin/bash DATASET_ROOT=/dataset/imageNet100_sicy/train/ #/raid/common/imagenet-raw/ ## train ViT-small for 800 epochs NPROC_PER_NODE=16 # GPU numbers BATCH_SIZE_PER_GPU=64 DEBUG=false # debug = true, then we only load subset of the whole training dataset python -m torch.distributed.launch --nproc_per_node=$NPROC_PER_NODE main.py \ --data_path $DATASET_ROOT \ --output_dir $OUTPUT_ROOT \ --arch vit_small \ --instance_queue_size 65536 \ --local_group_queue_size 65536 \ --use_bn_in_head false \ --instance_out_dim 256 \ --instance_temp 0.2 \ --local_group_out_dim 256 \ --local_group_temp 0.2 \ --local_group_knn_top_n 8 \ --group_out_dim 65536 \ --group_student_temp 0.1 \ --group_warmup_teacher_temp 0.04 \ --group_teacher_temp 0.07 \ --group_warmup_teacher_temp_epochs 30 \ --norm_last_layer false \ --norm_before_pred true \ --batch_size_per_gpu $BATCH_SIZE_PER_GPU \ --epochs 800 \ --warmup_epochs 10 \ --clip_grad 3.0 \ --lr 0.0008 \ --min_lr 1e-06 \ --patch_embed_lr_mult 0.2 \ --drop_path_rate 0.1 \ --weight_decay 0.04 \ --weight_decay_end 0.1 \ --freeze_last_layer 1 \ --momentum_teacher 0.996 \ --use_fp16 false \ --local_crops_number 10 \ --size_crops 96 \ --global_crops_scale 0.25 1 \ --local_crops_scale 0.05 0.25 \ --timm_auto_augment_par rand-m9-mstd0.5-inc1 \ --prob 0.5 \ --use_prefetcher true \ --debug $DEBUG