# #################################
# The recipe for distilling the CLAP baseline.
#
# Author:
#  * Francesco Paissan 2024
# #################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]

# Set up folders for reading from and writing to -- if null dataset is ignored
esc_folder: null
us8k_folder: null
tut17_folder: null
audiocaps_folder: null
macs_folder: null
clotho_folder: null
fsd50k_folder: null

device: "cpu"

# If True, only the projection layers are trained (encoder frozen)
projection_only: False

# Audio Enc Student type
audioenc_name_student: phinet_alpha_1.50_beta_0.75_t0_6_N_7
aud_emb_dim_student: 2048

# Zero-shot evaluation mode and teacher CLAP checkpoint location
zs_eval: False
clap_ckpt: "https://zenodo.org/records/7312125/files/CLAP_weights_2022.pth"

experiment_name: tinyCLAP
output_folder: !ref ./results/<experiment_name>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Tensorboard logs
use_tensorboard: False
tensorboard_logs_folder: !ref <output_folder>/tb_logs/

ckpt_interval_minutes: 15 # save checkpoint every N min

# Training parameters
number_of_epochs: 100
batch_size: 64
lr: 0.012
sample_rate: 44100
signal_length_s: 5

# Feature parameters
n_mels: 64
spec_mag_power: 1

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>

lr_annealing: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
    factor: 0.1
    patience: 10

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        student_model: !ref <student_model>
        counter: !ref <epoch_counter>

# Pretrained student checkpoint fetched from the HuggingFace hub
pretrained_CLAP: !ref fpaissan/tinyCLAP/<audioenc_name_student>.ckpt
load_CLAP: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: !ref <save_folder>
    loadables:
        student_model: !ref <student_model>
    paths:
        student_model: !ref <pretrained_CLAP>

# Teacher CLAP model hyperparameters
fmin: 50
fmax: 14000
aud_emb_classes_num: 527
emb_norm_type: bn
aud_emb_dim: 2048
txt_emb_dim: 768
shared_emb_dim: 1024
text_max_length: 100
use_pretrained: True

clap: !new:modules.CLAP
    audioenc_name: Cnn14
    classes_num: !ref <aud_emb_classes_num>
    out_emb: !ref <aud_emb_dim>
    text_model: bert-base-uncased
    transformer_embed_dim: !ref <txt_emb_dim>
    d_proj: !ref <shared_emb_dim>
    pretrained_weights: !ref <use_pretrained>
    CLAP_weights: !ref <clap_ckpt>
    audioenc_name_student: !ref <audioenc_name_student>
    out_emb_student: !ref <aud_emb_dim_student>

txt_tokenizer: !apply:transformers.AutoTokenizer.from_pretrained
    pretrained_model_name_or_path: bert-base-uncased

# Interpretation hyperparams
K: 1024

# pre-processing
n_fft: 1024
hop_length: 320
win_length: 1024
use_melspectra_log1p: False
use_melspectra: True
use_stft2mel: True

# Spectrogram extractor
spectrogram_extractor: !new:torchlibrosa.stft.Spectrogram
    n_fft: !ref <n_fft>
    hop_length: !ref <hop_length>
    win_length: !ref <win_length>
    window: "hann"
    center: True
    pad_mode: "reflect"
    freeze_parameters: True

# Logmel feature extractor
# NOTE(review): the original `!ref` targets were stripped in transit; n_fft
# here is reconstructed as <win_length> (== <n_fft> == 1024 in this config) --
# confirm against the upstream recipe.
logmel_extractor: !new:torchlibrosa.stft.LogmelFilterBank
    sr: !ref <sample_rate>
    n_fft: !ref <win_length>
    n_mels: !ref <n_mels>
    fmin: !ref <fmin>
    fmax: !ref <fmax>
    ref: 1.0
    amin: 0.0000000001
    top_db: null
    freeze_parameters: True

# Student audio encoder (the distillation target)
# NOTE(review): d_in reconstructed as the student embedding dim
# (<aud_emb_dim_student>, numerically equal to <aud_emb_dim> = 2048 here) --
# confirm against modules.AudioEncoder's signature.
student_model: !new:modules.AudioEncoder
    audioenc_name: !ref <audioenc_name_student>
    d_in: !ref <aud_emb_dim_student>
    d_out: !ref <shared_emb_dim>
    classes_num: !ref <aud_emb_classes_num>

modules:
    clap_student: !ref <student_model>