# Model, input and batching configuration.
# NOTE(review): nesting was reconstructed after indentation was lost. Every key
# with an empty value is treated as the parent of the entries that follow it.
# This is forced for vit_zoo, max_txt_l, batch_size and batch_size_test (the
# interpolation and the repeated image/video keys require it); for
# vision_encoder_args and video_input it is inferred from grouping — confirm
# against the code that reads this file.

text_encoder: bert-base-uncased
bert_config: configs/config_bert.json

# vit_type must name a key of vit_zoo; the interpolation below looks it up.
vit_type: beit
vit_zoo:
  beit: microsoft/beit-base-patch16-224-pt22k-ft22k
vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]}

vision_encoder_args:
  token_keep_rate: 0.7
  token_keep_strategy: cls_attn
  token_drop_loc: [3, 6, 9]
  sparse_local_attn: 1
  sparse_random_attn: 5
  attn_block_size: 56

image_res: 224
embed_dim: 256

# NOTE(review): the *_test keys are assumed to belong to video_input because
# they directly follow its other entries — confirm.
video_input:
  num_frames: 4
  reader: decord
  sample_type: rand
  num_frames_test: 16
  sample_type_test: middle

# Per-modality values, keyed by image / video.
max_txt_l:
  image: 32
  video: 32

batch_size:
  image: 8
  video: 8
batch_size_test:
  image: 8
  video: 8

k_test: 128
temp: 0.18
mlm_prob: 0.5