# Evaluation settings. One key per line: when collapsed onto a single line,
# the '#' comment below swallows every key that follows it.

# Dataset locations.
# NOTE(review): ann_root looks like a path relative to the working directory;
# confirm against the code that loads this file.
image_root: '/export/share/datasets/vision/nocaps/'
ann_root: 'annotation'

# Set pretrained to a local file path or a URL.
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'

# Model and input settings.
# NOTE(review): units and accepted values (e.g. pixels for image_size, other
# options for vit) are not visible here; verify against the consumer.
vit: 'base'
batch_size: 32
image_size: 384

# Caption generation settings.
# NOTE(review): presumably max_length/min_length bound the generated caption
# length and num_beams is the beam-search width; confirm against the consumer.
max_length: 20
min_length: 5
num_beams: 3
# The trailing space inside the quotes is part of the value; keep it.
prompt: 'a picture of '