# (extraction metadata, commented out -- not part of the recipe)
# File size: 3,807 Bytes
# revision: 5e02fce
# #################################
# The recipe for distilling the CLAP baseline.
#
# Author:
# * Francesco Paissan 2024
# #################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
# HyperPyYAML evaluates this at load time: parsing the file calls
# torch.manual_seed(<seed>) as a side effect. Requires the (non-safe)
# HyperPyYAML loader, which is standard for SpeechBrain hparams files.
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
# Set up folders for reading from and writing to -- if null dataset is ignored
# NOTE(review): names suggest ESC(-50), UrbanSound8K, TUT 2017, AudioCaps,
# MACS, Clotho and FSD50K dataset roots -- confirm against the training script.
esc_folder: null
us8k_folder: null
tut17_folder: null
audiocaps_folder: null
macs_folder: null
clotho_folder: null
fsd50k_folder: null
device: "cpu"
# presumably trains only the projection layers when True -- TODO confirm
projection_only: False
# Audio Enc Student type
audioenc_name_student: phinet_alpha_1.50_beta_0.75_t0_6_N_7
aud_emb_dim_student: 2048  # student audio-embedding dimensionality
zs_eval: False  # presumably zero-shot evaluation mode -- confirm in train script
# Pretrained CLAP (teacher) checkpoint, fetched from Zenodo.
clap_ckpt: "https://zenodo.org/records/7312125/files/CLAP_weights_2022.pth"
experiment_name: tinyCLAP
# Output/bookkeeping folders, derived from experiment name and seed.
output_folder: !ref ./results/<experiment_name>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
# Tensorboard logs
use_tensorboard: False
tensorboard_logs_folder: !ref <output_folder>/tb_logs/
ckpt_interval_minutes: 15 # save checkpoint every N min
# Training parameters
number_of_epochs: 100
batch_size: 64
lr: 0.012
sample_rate: 44100  # Hz; also consumed by the logmel extractor below
signal_length_s: 5  # audio clip length in seconds
# Feature parameters
n_mels: 64
spec_mag_power: 1  # spectrogram magnitude exponent -- confirm consumer in train script
# Epoch counter, checkpointed below so training resumes at the right epoch.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>
# Optimizer passed as a factory (!name:) so the Brain instantiates it on the
# model parameters at fit time.
opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
# Scale lr by `factor` once the tracked metric plateaus for `patience` epochs.
lr_annealing: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
    factor: 0.1
    patience: 10
# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>
# Saves/restores the student model and the epoch counter.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        student_model: !ref <student_model>
        counter: !ref <epoch_counter>
# HuggingFace-hub-style path of a pretrained tinyCLAP student checkpoint
# matching the selected student architecture.
pretrained_CLAP: !ref fpaissan/tinyCLAP/<audioenc_name_student>.ckpt
# Pretrainer: collects <pretrained_CLAP> into <save_folder> and loads it into
# the student model -- when this is invoked is decided by the training script.
load_CLAP: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: !ref <save_folder>
    loadables:
        student_model: !ref <student_model>
    paths:
        student_model: !ref <pretrained_CLAP>
# Mel filterbank frequency range (Hz) for the logmel extractor below.
fmin: 50
fmax: 14000
aud_emb_classes_num: 527  # presumably the AudioSet class count -- TODO confirm
emb_norm_type: bn  # embedding normalization type; "bn" presumably batch-norm
# Embedding sizes: teacher audio encoder output, text encoder output, and the
# shared projection space in which audio/text embeddings are compared.
aud_emb_dim: 2048
txt_emb_dim: 768
shared_emb_dim: 1024
text_max_length: 100  # max token length for text inputs -- confirm tokenizer usage
use_pretrained: True
# Teacher CLAP: Cnn14 audio encoder + BERT text encoder projected to the
# shared space; weights restored from <clap_ckpt>. Student encoder settings
# are passed alongside (see modules.CLAP for their exact use).
clap: !new:modules.CLAP
    audioenc_name: Cnn14
    classes_num: !ref <aud_emb_classes_num>
    out_emb: !ref <aud_emb_dim>
    text_model: bert-base-uncased
    transformer_embed_dim: !ref <txt_emb_dim>
    d_proj: !ref <shared_emb_dim>
    pretrained_weights: !ref <use_pretrained>
    CLAP_weights: !ref <clap_ckpt>
    audioenc_name_student: !ref <audioenc_name_student>
    out_emb_student: !ref <aud_emb_dim_student>
# Tokenizer matching the teacher's text encoder (bert-base-uncased).
txt_tokenizer: !apply:transformers.AutoTokenizer.from_pretrained
    pretrained_model_name_or_path: bert-base-uncased
# Interpretation hyperparams
K: 1024
# pre-processing (STFT) parameters
n_fft: 1024
hop_length: 320
win_length: 1024
# Feature-pipeline switches selecting the spectral representation fed to the
# student. NOTE(review): their semantics and valid combinations live in the
# training script -- confirm there.
use_melspectra_log1p: False
use_melspectra: True
use_stft2mel: True
# Spectrogram extractor
spectrogram_extractor: !new:torchlibrosa.stft.Spectrogram
    n_fft: !ref <n_fft>
    hop_length: !ref <hop_length>
    win_length: !ref <win_length>
    window: "hann"
    center: True
    pad_mode: "reflect"
    freeze_parameters: True  # keep STFT kernels fixed (not trained)
# Logmel feature extractor
logmel_extractor: !new:torchlibrosa.stft.LogmelFilterBank
    # NOTE(review): n_fft below refs <win_length>; it equals <n_fft> (1024) so
    # behavior is unchanged, but !ref <n_fft> would be clearer -- confirm intent.
    sr: !ref <sample_rate>
    n_fft: !ref <win_length>
    n_mels: !ref <n_mels>
    fmin: !ref <fmin>
    fmax: !ref <fmax>
    ref: 1.0
    amin: 0.0000000001  # 1e-10 floor before the log -- avoids log(0)
    top_db: null
    freeze_parameters: True
# Student audio encoder distilled from the CLAP teacher.
student_model: !new:modules.AudioEncoder
    audioenc_name: !ref <audioenc_name_student>
    d_in: !ref <aud_emb_dim_student>
    d_out: !ref <shared_emb_dim>
    classes_num: !ref <aud_emb_classes_num>
# Modules dict handed to the SpeechBrain Brain (moved to <device>, etc.).
modules:
    clap_student: !ref <student_model>