|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
seed: 1234 |
|
__set_seed: !!python/object/apply:torch.manual_seed [1234] |
|
|
|
|
|
|
|
data_folder: /data2/ESC-50-master |
|
|
|
audio_data_folder: /data2/ESC-50-master/audio |
|
|
|
experiment_name: piq |
|
output_folder: ./results/piq/1234 |
|
save_folder: ./results/piq/1234/save |
|
train_log: ./results/piq/1234/train_log.txt |
|
|
|
test_only: false |
|
save_interpretations: true |
|
interpret_period: 10 |
|
|
|
|
|
use_tensorboard: false |
|
tensorboard_logs_folder: ./results/piq/1234/tb_logs/ |
|
|
|
|
|
train_annotation: /data2/ESC-50-master/manifest/train.json |
|
valid_annotation: /data2/ESC-50-master/manifest/valid.json |
|
test_annotation: /data2/ESC-50-master/manifest/test.json |
|
|
|
|
|
|
|
train_fold_nums: [1, 2, 3] |
|
valid_fold_nums: [4] |
|
test_fold_nums: [5] |
|
skip_manifest_creation: false |
|
|
|
ckpt_interval_minutes: 15 |
|
|
|
|
|
number_of_epochs: 200 |
|
batch_size: 16 |
|
lr: 0.0002 |
|
sample_rate: 16000 |
|
use_vq: true |
|
rec_loss_coef: 1 |
|
use_mask_output: true |
|
mask_th: 0.35 |
|
|
|
device: cuda |
|
|
|
|
|
n_mels: 80 |
|
|
|
|
|
out_n_neurons: 50 |
|
|
|
shuffle: true |
|
dataloader_options: |
|
batch_size: 16 |
|
shuffle: true |
|
num_workers: 0 |
|
|
|
epoch_counter: &id001 !new:speechbrain.utils.epoch_loop.EpochCounter |
|
|
|
limit: 200 |
|
|
|
opt_class: !name:torch.optim.Adam |
|
lr: 0.0002 |
|
weight_decay: 0.000002 |
|
|
|
lr_annealing: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau |
|
factor: 0.5 |
|
patience: 3 |
|
dont_halve_until_epoch: 100 |
|
|
|
|
|
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger |
|
save_file: ./results/piq/1234/train_log.txt |
|
|
|
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer |
|
checkpoints_dir: ./results/piq/1234/save |
|
recoverables: |
|
psi_model: &id004 !new:speechbrain.lobes.models.PIQ.VectorQuantizedPSI_Audio |
|
dim: 256 |
|
K: 1024 |
|
shared_keys: 0 |
|
activate_class_partitioning: true |
|
use_adapter: true |
|
adapter_reduce_dim: true |
|
|
|
counter: *id001 |
|
use_pretrained: true |
|
|
|
|
|
embedding_model: &id002 !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2 |
|
dim: 256 |
|
|
|
classifier: &id003 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier |
|
input_size: 256 |
|
out_neurons: 50 |
|
lin_blocks: 1 |
|
|
|
|
|
|
|
K: 1024 |
|
|
|
|
|
n_fft: 1024 |
|
spec_mag_power: 0.5 |
|
hop_length: 11.6099 |
|
win_length: 23.2199 |
|
compute_stft: &id005 !new:speechbrain.processing.features.STFT |
|
n_fft: 1024 |
|
hop_length: 11.6099 |
|
win_length: 23.2199 |
|
sample_rate: 16000 |
|
|
|
compute_fbank: &id006 !new:speechbrain.processing.features.Filterbank |
|
n_mels: 80 |
|
n_fft: 1024 |
|
sample_rate: 16000 |
|
|
|
compute_istft: &id007 !new:speechbrain.processing.features.ISTFT |
|
sample_rate: 16000 |
|
hop_length: 11.6099 |
|
win_length: 23.2199 |
|
|
|
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder |
|
psi_model: *id004 |
|
modules: |
|
compute_stft: *id005 |
|
compute_fbank: *id006 |
|
compute_istft: *id007 |
|
psi: *id004 |
|
embedding_model: !ref <embedding_model> |
|
classifier: !ref <classifier> |
|
|
|
embedding_model_path: fpaissan/conv2d_us8k/embedding_modelft.ckpt |
|
classifier_model_path: fpaissan/conv2d_us8k/classifier.ckpt |
|
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer |
|
loadables: |
|
embedding_model: !ref <embedding_model> |
|
classifier: !ref <classifier> |
|
psi: !ref <psi_model> |
|
label_encoder: !ref <label_encoder> |
|
paths: |
|
embedding_model: fpaissan/conv2d_us8k/embedding_modelft.ckpt |
|
classifier: fpaissan/conv2d_us8k/classifier.ckpt |
|
psi: /data2/PIQ-ESC50/psi_model.ckpt |
|
label_encoder: speechbrain/cnn14-esc50/label_encoder.txt |
|
|