# Input size and batching.
# NOTE(review): image_res is presumably the square input resolution in
# pixels — confirm against the data pipeline.
image_res: 224
batch_size_train: 16
batch_size_test: 8

# Written as a canonical lowercase boolean; resolves the same as `True`.
warm_up: true


# Optimizer hyperparameters.
# Floats in scientific notation carry an explicit decimal point (2.0e-5, not
# 2e-5): YAML 1.1 loaders such as PyYAML resolve a bare `2e-5` as the string
# '2e-5', whereas `2.0e-5` resolves as a float under both YAML 1.1 and 1.2.
optimizer:
  opt: adamW
  lr: 2.0e-5
  weight_decay: 0.02
  prompt_lr: 1.0e-5

# Learning-rate schedule.
# The key is spelled `schedular` because that is the name this file already
# exposes; do not rename it without updating whatever reads this config.
schedular:
  sched: cosine
  scheduler_groups: 0
  lr: 2.0e-5
  epochs: 26
  min_lr: 1.0e-6
  decay_rate: 1
  warmup_lr: 1.0e-5
  warmup_epochs: 4
  cooldown_epochs: 0

# Visual prefix settings.
# NOTE(review): start_layer_idx / end_layer_idx presumably bound the range of
# layers the prefix applies to — confirm against the model code.
use_vis_prefix: true
start_layer_idx: 19
end_layer_idx: 31

injected_hidden_states: 6

lm_loss_weight: 0.1

# Both layer-norm unfreeze switches are disabled.
unfreeze_text_layer_norm: false
unfreeze_vision_layer_norm: false


# Data loading.
num_workers: 4

# Token and label handling switches.
replace_added_tokens: true
use_cache: false
shift_labels: false
append_eos_token: true

# Decoding options: beam count and sampling switch.
num_beams: 3
do_sample: false
# Prompt tuning: on/off switch and prompt length.
prompt_tuning: true
prompt_len: 10


# Video
modality: 'video'
dataset_name: 'msrvtt'

# Dataset splits; val_split and test_split name the same split.
train_split: 'msrvtt_caption_train7k'
val_split: 'msrvtt_caption_test'
test_split: 'msrvtt_caption_test'

# Frame sampling.
num_frames: 16
sample_type: 'rand'
num_tries: 1
as_images: false
all_vis_tokens: false

# Vision model and its pretrained checkpoint.
# NOTE(review): pretrained_model is an absolute, machine-specific path; it
# only resolves on a system with this filesystem layout.
vision_model_name: 'timesformer'
pretrained_model: '/gpfswork/rech/dyf/ugz83ue/.cache/torch/hub/checkpoints/TimeSformer_divST_8x32_224_K600.pyth'