File size: 3,807 Bytes
5e02fce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# #################################
# The recipe for distilling the CLAP baseline.
#
# Author:
#  * Francesco Paissan 2024
# #################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
# HyperPyYAML side-effect hook: calls torch.manual_seed(<seed>) at parse time,
# so every module instantiated below is initialized reproducibly.
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]

# Set up folders for reading from and writing to -- if null dataset is ignored
# Zero-shot evaluation datasets (sound-event classification corpora).
esc_folder: null
us8k_folder: null
tut17_folder: null

# Audio-text datasets (captioning / tagging corpora).
audiocaps_folder: null
macs_folder: null
clotho_folder: null
fsd50k_folder: null

device: "cpu"

# NOTE(review): flag name suggests only the student's projection head is
# trained when True -- confirm against the training script.
projection_only: False

# Audio Enc Student type
# Encoded architecture string parsed by modules.AudioEncoder (PhiNet here).
audioenc_name_student: phinet_alpha_1.50_beta_0.75_t0_6_N_7
# Dimensionality of the student audio embedding (d_in of the projection).
aud_emb_dim_student: 2048

# If True, run zero-shot evaluation instead of training (inferred from name;
# verify in the recipe script).
zs_eval: False

# Pretrained teacher CLAP checkpoint (Microsoft CLAP 2022 weights).
clap_ckpt: "https://zenodo.org/records/7312125/files/CLAP_weights_2022.pth"

experiment_name: tinyCLAP
output_folder: !ref ./results/<experiment_name>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Tensorboard logs
use_tensorboard: False
tensorboard_logs_folder: !ref <output_folder>/tb_logs/

ckpt_interval_minutes: 15  # save checkpoint every N min

# Training parameters
number_of_epochs: 100
batch_size: 64

lr: 0.012

# Input audio: 5-second clips at 44.1 kHz.
sample_rate: 44100
signal_length_s: 5

# Feature parameters
n_mels: 64
spec_mag_power: 1

# Tracks training progress; also checkpointed (see `recoverables` below).
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

# Optimizer constructor (partial) -- instantiated by the Brain with the
# model parameters.
opt_class: !name:torch.optim.Adam
    lr: !ref <lr>

# LR is divided by 10 after `patience` epochs without improvement.
lr_annealing: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
    factor: 0.1
    patience: 10

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

# Saves/restores only the student model and the epoch counter; the frozen
# teacher CLAP is not checkpointed.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        student_model: !ref <student_model>
        counter: !ref <epoch_counter>

# HuggingFace Hub path of a pretrained tinyCLAP student matching the chosen
# student architecture; used by the Pretrainer below.
pretrained_CLAP: !ref fpaissan/tinyCLAP/<audioenc_name_student>.ckpt
load_CLAP: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: !ref <save_folder>
    loadables:
        student_model: !ref <student_model>
    paths:
        student_model: !ref <pretrained_CLAP>

# Mel filterbank frequency range (Hz).
fmin: 50
fmax: 14000
# AudioSet has 527 classes (teacher CNN14 classification head size).
aud_emb_classes_num: 527

emb_norm_type: bn
# Teacher audio embedding dim / BERT hidden dim / shared projection dim.
aud_emb_dim: 2048
txt_emb_dim: 768
shared_emb_dim: 1024
text_max_length: 100

use_pretrained: True
# Teacher+student wrapper: CNN14 audio encoder, BERT text encoder, plus the
# student encoder defined by <audioenc_name_student>.
clap: !new:modules.CLAP
    audioenc_name: Cnn14
    classes_num: !ref <aud_emb_classes_num>
    out_emb: !ref <aud_emb_dim>
    text_model: bert-base-uncased
    transformer_embed_dim: !ref <txt_emb_dim>
    d_proj: !ref <shared_emb_dim>
    pretrained_weights: !ref <use_pretrained>
    CLAP_weights: !ref <clap_ckpt>
    audioenc_name_student: !ref <audioenc_name_student>
    out_emb_student: !ref <aud_emb_dim_student>

# Tokenizer must match the text encoder used in modules.CLAP above.
txt_tokenizer: !apply:transformers.AutoTokenizer.from_pretrained
    pretrained_model_name_or_path: bert-base-uncased

# Interpretation hyperparams
K: 1024

# pre-processing
n_fft: 1024
hop_length: 320
win_length: 1024
use_melspectra_log1p: False
use_melspectra: True
use_stft2mel: True

# Spectrogram extractor
spectrogram_extractor: !new:torchlibrosa.stft.Spectrogram
    n_fft: !ref <n_fft>
    hop_length: !ref <hop_length>
    win_length: !ref <win_length>
    window: "hann"
    center: True
    pad_mode: "reflect"
    freeze_parameters: True

# Logmel feature extractor
# NOTE(review): n_fft is set from <win_length>, not <n_fft>; both are 1024
# here so the values coincide -- confirm intent before changing either.
logmel_extractor: !new:torchlibrosa.stft.LogmelFilterBank
    sr: !ref <sample_rate>
    n_fft: !ref <win_length>
    n_mels: !ref <n_mels>
    fmin: !ref <fmin>
    fmax: !ref <fmax>
    ref: 1.0
    amin: 0.0000000001
    top_db: null
    freeze_parameters: True


# The distilled (student) audio encoder: projects <aud_emb_dim_student>
# features into the <shared_emb_dim> CLAP embedding space.
student_model: !new:modules.AudioEncoder
    audioenc_name: !ref <audioenc_name_student>
    d_in: !ref <aud_emb_dim_student>
    d_out: !ref <shared_emb_dim>
    classes_num: !ref <aud_emb_classes_num>

# Modules dict handed to the SpeechBrain Brain (moved to <device>, trained).
modules:
    clap_student: !ref <student_model>