---
# Experiment configuration, one key per line.
# The earlier single-line form could not be loaded: a plain scalar may not
# contain ": ", and the inline "#" turned every later key into comment text.

image_res: 224
batch_size_train: 8
batch_size_test: 64

warm_up: true

optimizer:
  opt: adamW
  # Exponent floats are written with an explicit fraction ("2.0e-5", not
  # "2e-5"): YAML 1.1 resolvers (e.g. PyYAML) load the dot-less form as a
  # string, while this form is a float under both YAML 1.1 and 1.2.
  lr: 2.0e-5
  weight_decay: 0.02
  prompt_lr: 1.0e-5

# NOTE(review): the key is spelled "schedular" (not "scheduler") on purpose
# here -- it is kept exactly as found because the consuming code presumably
# looks it up by this name; confirm before renaming.
schedular:
  sched: cosine
  scheduler_groups: 0
  lr: 2.0e-5
  epochs: 30
  min_lr: 1.0e-6
  decay_rate: 1
  warmup_lr: 1.0e-5
  warmup_epochs: 4
  cooldown_epochs: 0

use_vis_prefix: true
start_layer_idx: 19
end_layer_idx: 31
injected_hidden_states: 6
lm_loss_weight: 0.1

unfreeze_text_layer_norm: false
unfreeze_vision_layer_norm: false

num_workers: 4

replace_added_tokens: true
use_cache: false
shift_labels: false
append_eos_token: true
num_beams: 3
do_sample: false

# Prompt tuning
prompt_tuning: true
prompt_len: 10

# Dataset selection
modality: 'audio'
dataset_name: 'audiocaps'
train_split: 'audiocaps_caption_train'
val_split: 'audiocaps_caption_val'
test_split: 'audiocaps_caption_test'

# Audio input settings
# NOTE(review): presumably consumed by the audio data loader (spectrogram
# size, normalization, masking) -- confirm against the loader code.
melbins: 128
target_length: 1024
num_tries: 8
skip_norm: false
norm_mean: -4.2677393
norm_std: 4.5689974
noise: false
freqm_p: 24
timem_p: 96

all_vis_tokens: false

vision_model_name: 'ast'
# NOTE(review): absolute, machine-specific checkpoint path; must exist on the
# host that runs this config.
pretrained_model: '/gpfswork/rech/dyf/ugz83ue/.cache/torch/hub/checkpoints/audioset_10_10_0.4593.pth'