---
# Run configuration for video question answering on the dataset and splits
# named below, using the vision encoder selected under "Vision backbone".
#
# Scientific-notation floats are written with an explicit decimal point
# (2.0e-5 rather than 2e-5): YAML 1.1 loaders such as PyYAML resolve the
# dot-less form as a string instead of a float.

# Input resolution and batch sizes.
image_res: 224
batch_size_train: 8
batch_size_test: 64
k_test: 64

warm_up: true

optimizer:
  opt: adamW
  lr: 2.0e-5
  weight_decay: 0.02
  prompt_lr: 1.0e-5

# NOTE(review): the key is spelled `schedular` (not `scheduler`). It is kept
# as-is because consumers presumably look it up by this exact name; verify
# against the loading code before renaming.
schedular:
  sched: cosine
  scheduler_groups: 0
  lr: 2.0e-5
  epochs: 26
  min_lr: 1.0e-6
  decay_rate: 1
  warmup_lr: 1.0e-5
  warmup_epochs: 4
  cooldown_epochs: 0

# Visual prefix / hidden-state injection settings.
use_vis_prefix: true
start_layer_idx: 19
end_layer_idx: 31
injected_hidden_states: 6

lm_loss_weight: 0.1

unfreeze_text_layer_norm: false
unfreeze_vision_layer_norm: false

num_workers: 4

special_answer_token: '</a>'
replace_added_tokens: true

use_cache: false

connector_per_text_layer: false
text_step: 1

# Generation settings.
num_beams: 3
do_sample: false

# Prompt tuning.
prompt_tuning: true
prompt_len: 10

# Dataset selection.
modality: 'video'
dataset_name: 'msvd'

train_split: 'msvd_vqa_train'
val_split: 'msvd_vqa_val'
test_split: 'msvd_vqa_test'

# Video frame sampling.
num_frames: 8
sample_type: 'rand'
num_tries: 1
as_images: false
all_vis_tokens: false

# Vision backbone.
vision_model_name: 'timesformer'
# NOTE(review): absolute, machine-specific checkpoint path; it must be
# adjusted for any environment other than the one it was written for.
pretrained_model: '/gpfswork/rech/dyf/ugz83ue/.cache/torch/hub/checkpoints/TimeSformer_divST_8x32_224_K600.pyth'