---
# Generated 2023-07-14 from:
# /data2/cloned_repos/speechbrain-clone/recipes/ESC50/interpret/hparams/piq.yaml
# yamllint disable
# #################################
# The recipe for training PIQ on the ESC50 dataset.
#
# Author:
#  * Cem Subakan 2022, 2023
#  * Francesco Paissan 2022, 2023
#  (based on the SpeechBrain UrbanSound8k recipe)
# #################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
# Calls torch.manual_seed(1234) as a side effect of loading this YAML.
# NOTE(review): the !!python/object/apply tag is NOT compatible with
# yaml.safe_load; this file must be loaded with hyperpyyaml (or another
# loader that permits python-object tags).
__set_seed: !!python/object/apply:torch.manual_seed [1234]

# Set up folders for reading from and writing to
# Dataset must already exist at `audio_data_folder`
data_folder: /data2/ESC-50-master
# (example alternative location: /localscratch/UrbanSound8K)
audio_data_folder: /data2/ESC-50-master/audio

experiment_name: piq
# The paths below were expanded from <experiment_name>/<seed> when this file
# was generated; keep them in sync if you change either value above.
output_folder: ./results/piq/1234
save_folder: ./results/piq/1234/save
train_log: ./results/piq/1234/train_log.txt

test_only: false
save_interpretations: true
# Presumably "interpret every N epochs" -- TODO confirm against the train script.
interpret_period: 10

# Tensorboard logs
use_tensorboard: false
tensorboard_logs_folder: ./results/piq/1234/tb_logs/

# Path where data manifest files will be stored
train_annotation: /data2/ESC-50-master/manifest/train.json
valid_annotation: /data2/ESC-50-master/manifest/valid.json
test_annotation: /data2/ESC-50-master/manifest/test.json

# Cross-validation fold assignment: folds 1-3 train, fold 4 validates,
# fold 5 tests. (The original comment referenced UrbanSound8k's 10 folds;
# the values here use a 5-fold layout, matching ESC-50's pre-defined folds.)
train_fold_nums: [1, 2, 3]
valid_fold_nums: [4]
test_fold_nums: [5]
skip_manifest_creation: false

ckpt_interval_minutes: 15 # save checkpoint every N min

# Training parameters
number_of_epochs: 200
batch_size: 16
lr: 0.0002
sample_rate: 16000
use_vq: true  # vector-quantized PSI variant (see psi_model under checkpointer)
rec_loss_coef: 1  # presumably weights a reconstruction loss -- verify in train script
use_mask_output: true
mask_th: 0.35  # presumably the interpretation-mask threshold -- verify in train script

device: cuda

# Feature parameters
n_mels: 80

# Number of classes
out_n_neurons: 50

shuffle: true
# batch_size/shuffle duplicate the top-level keys above (expanded at
# generation time); change both places together.
dataloader_options:
  batch_size: 16
  shuffle: true
  num_workers: 0

# Anchored as &id001 so the checkpointer below can recover this same instance.
epoch_counter: &id001 !new:speechbrain.utils.epoch_loop.EpochCounter

  limit: 200  # matches number_of_epochs above

# Optimizer is passed as a class (!name:), instantiated later by the Brain.
opt_class: !name:torch.optim.Adam
  lr: 0.0002
  weight_decay: 0.000002

# Halve the LR (factor 0.5) after 3 stagnant epochs, but not before epoch 100.
lr_annealing: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
  factor: 0.5
  patience: 3
  dont_halve_until_epoch: 100

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: ./results/piq/1234/train_log.txt

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: ./results/piq/1234/save
  recoverables:
    # The PSI interpreter network; anchored as &id004 and re-used below
    # (top-level psi_model key and modules/psi).
    psi_model: &id004 !new:speechbrain.lobes.models.PIQ.VectorQuantizedPSI_Audio
      dim: 256
      K: 1024
      shared_keys: 0
      activate_class_partitioning: true
      use_adapter: true
      adapter_reduce_dim: true

    counter: *id001
use_pretrained: true

# embedding_model: !new:custom_models.Conv2dEncoder_v2
# NOTE(review): anchors &id002 and &id003 are never aliased in this file
# (modules/pretrainer use !ref instead) -- harmless generator artifact.
embedding_model: &id002 !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2
  dim: 256

classifier: &id003 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  input_size: 256
  out_neurons: 50
  lin_blocks: 1


# Interpretation hyperparams
K: 1024  # codebook size; matches K inside psi_model

# pre-processing
n_fft: 1024
spec_mag_power: 0.5
# NOTE(review): hop_length/win_length look like milliseconds (SpeechBrain
# STFT convention), not samples -- confirm against
# speechbrain.processing.features.STFT before changing.
hop_length: 11.6099
win_length: 23.2199
compute_stft: &id005 !new:speechbrain.processing.features.STFT
  n_fft: 1024
  hop_length: 11.6099
  win_length: 23.2199
  sample_rate: 16000

compute_fbank: &id006 !new:speechbrain.processing.features.Filterbank
  n_mels: 80
  n_fft: 1024
  sample_rate: 16000

# Inverse STFT, used to reconstruct waveforms (e.g., saved interpretations).
compute_istft: &id007 !new:speechbrain.processing.features.ISTFT
  sample_rate: 16000
  hop_length: 11.6099
  win_length: 23.2199

label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
psi_model: *id004  # same instance registered in checkpointer recoverables
# Module dict handed to the Brain; psi aliases the checkpointed psi_model.
modules:
  compute_stft: *id005
  compute_fbank: *id006
  compute_istft: *id007
  psi: *id004
  embedding_model: !ref <embedding_model>
  classifier: !ref <classifier>

embedding_model_path: fpaissan/conv2d_us8k/embedding_modelft.ckpt
classifier_model_path: fpaissan/conv2d_us8k/classifier.ckpt
# Fetches pretrained weights; the embedding/classifier paths duplicate the
# *_path keys above (expanded at generation time) -- change both together.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    psi: !ref <psi_model>
    label_encoder: !ref <label_encoder>
  paths:
    embedding_model: fpaissan/conv2d_us8k/embedding_modelft.ckpt
    classifier: fpaissan/conv2d_us8k/classifier.ckpt
    psi: /data2/PIQ-ESC50/psi_model.ckpt
    label_encoder: speechbrain/cnn14-esc50/label_encoder.txt