---
# Configuration for the 'flickr' dataset using a 'base' ViT.
# One key per line: YAML block mappings need each entry on its own line, and
# a ' #' sequence turns the remainder of that line into a comment.

# Dataset paths and identifier.
# NOTE(review): ann_root is a relative path; presumably resolved against the
# working directory of the consuming script - confirm.
image_root: '/export/share/datasets/vision/flickr30k/'
ann_root: 'annotation'
dataset: 'flickr'

# set pretrained as a file path or a URL
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_flickr.pth'

# size of vit model; base or large
vit: 'base'
batch_size_train: 32
batch_size_test: 64
vit_grad_ckpt: true
vit_ckpt_layer: 4
# Written with a mantissa dot so YAML 1.1 loaders (e.g. PyYAML) resolve it as
# a float; a bare '1e-5' is read as a string by those loaders.
init_lr: 1.0e-5

# Alternative settings for the large model. To use them, comment out the
# 'base' group above and uncomment the lines below.
# vit: 'large'
# batch_size_train: 16
# batch_size_test: 32
# vit_grad_ckpt: true
# vit_ckpt_layer: 10
# init_lr: 5.0e-6

image_size: 384
queue_size: 57600
alpha: 0.4
k_test: 128
negative_all_rank: false

# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 6