# #################################
# Hyperparameters for the PIQ interpreter on the ESC50 dataset.
# (Derived from the PIQ ESC50 training recipe; this file wires up the
# feature pipeline, the classifier, the PSI interpreter and a Pretrainer
# that fetches their checkpoints.)
#
# Author:
#  * Cem Subakan 2022, 2023
#  * Francesco Paissan 2022, 2023
#  (based on the SpeechBrain UrbanSound8k recipe)
# #################################

sample_rate: 16000

use_vq: true
rec_loss_coef: 1
use_mask_output: true
mask_th: 0.35

device: cpu

# Feature parameters
n_mels: 80

# Number of classes
out_n_neurons: 50

# embedding_model: !new:custom_models.Conv2dEncoder_v2
embedding_model: !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2
  dim: 256

# input_size equals the encoder `dim` above (both are 256).
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  input_size: 256
  out_neurons: !ref <out_n_neurons>
  lin_blocks: 1

# Interpretation hyperparams
K: 1024

# pre-processing
# NOTE(review): SpeechBrain's STFT/ISTFT appear to take hop_length and
# win_length in milliseconds, not samples -- confirm before changing.
n_fft: 1024
spec_mag_power: 0.5
hop_length: 11.6099
win_length: 23.2199

compute_stft: !new:speechbrain.processing.features.STFT
  n_fft: !ref <n_fft>
  hop_length: !ref <hop_length>
  win_length: !ref <win_length>
  sample_rate: !ref <sample_rate>

compute_fbank: !new:speechbrain.processing.features.Filterbank
  n_mels: !ref <n_mels>
  n_fft: !ref <n_fft>
  sample_rate: !ref <sample_rate>

compute_istft: !new:speechbrain.processing.features.ISTFT
  sample_rate: !ref <sample_rate>
  hop_length: !ref <hop_length>
  win_length: !ref <win_length>

label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder

# `dim` equals the encoder `dim`; `K` reuses the top-level K.
psi_model: !new:speechbrain.lobes.models.PIQ.VectorQuantizedPSI_Audio
  dim: 256
  K: !ref <K>
  shared_keys: 0
  activate_class_partitioning: true
  use_adapter: true
  adapter_reduce_dim: true

# Each entry references the object of the same name defined above; `psi`
# points at `psi_model`, the only PSI object declared in this file.
modules:
  compute_stft: !ref <compute_stft>
  compute_fbank: !ref <compute_fbank>
  compute_istft: !ref <compute_istft>
  psi: !ref <psi_model>
  embedding_model: !ref <embedding_model>
  classifier: !ref <classifier>

# `loadables` and `paths` share the same keys: each loadable object is
# paired with the checkpoint path listed under the identical key.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    psi: !ref <psi_model>
    label_encoder: !ref <label_encoder>
  paths:
    embedding_model: speechbrain/PIQ-ESC50/embedding_modelft.ckpt
    classifier: speechbrain/PIQ-ESC50/classifier.ckpt
    psi: speechbrain/PIQ-ESC50/psi_model.ckpt
    label_encoder: speechbrain/cnn14-esc50/label_encoder.txt