voxlingua107-xls-r-300m-wav2vec / inference_wav2vec.yaml
############################# Inference ###################################################
# #################################
# Basic inference parameters for spoken language identification. We first have
# a network that computes utterance embeddings; on top of that, we employ a
# classifier.
#
# Author:
# * Mirco Ravanelli 2021
# * Kunnar Kukk 2022
# #################################
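# Inference pipeline defined below: raw audio -> frozen wav2vec2 (XLS-R 300M)
# frame features -> attentive statistics pooling -> linear classifier ->
# log-softmax over the 107 language classes.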
# Pretrained model location:
pretrained_path: TalTechNLP/voxlingua107-xls-r-300m-wav2vec
# Model parameters
sample_rate: 16000
device: 'cpu'
# Feature extraction
compute_features: !new:speechbrain.lobes.features.Fbank
    n_mels: 60
######################## Wav2Vec ########################
# HuggingFace hub id of the wav2vec2 model.
wav2vec2_hub: facebook/wav2vec2-xls-r-300m
freeze_wav2vec: True
save_folder: ./save
wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
    source: !ref <wav2vec2_hub>
    output_norm: True
    freeze: !ref <freeze_wav2vec>
    save_path: !ref <save_folder>/wav2vec2_checkpoint
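# Number of output classes: one per VoxLingua107 language.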
out_neurons: 107
classifier: !new:speechbrain.lobes.models.Xvector.Classifier
    input_shape: [null, null, 2048]
    activation: !name:torch.nn.LeakyReLU
    lin_blocks: 1
    lin_neurons: 512
    out_neurons: !ref <out_neurons>
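# Maps classifier output indices to language labels; its mapping is restored
# from label_encoder.txt by the pretrainer below.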
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
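# Attentive statistics pooling over the 1024-dim wav2vec2 frame outputs;
# concatenating the attended mean and std yields the 2048-dim vector expected
# by the classifier's input_shape above.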
attentive: !new:speechbrain.lobes.models.ECAPA_TDNN.AttentiveStatisticsPooling
    channels: 1024
    attention_channels: 64
modules:
    wav2vec2: !ref <wav2vec2>
    compute_features: !ref <compute_features>
    classifier: !ref <classifier>
    attentive: !ref <attentive>
    softmax: !ref <softmax>
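# Only the pooling and classifier weights are grouped into "model"; the
# wav2vec2 encoder stays frozen (freeze_wav2vec above) and is restored
# separately by the pretrainer.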
model: !new:torch.nn.ModuleList
    - [!ref <attentive>, !ref <classifier>]
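# The pretrainer maps each module under "loadables" to a checkpoint under
# <pretrained_path> and restores the weights when loading for inference.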
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        wav2vec2: !ref <wav2vec2>
        classifier: !ref <classifier>
        label_encoder: !ref <label_encoder>
        model: !ref <model>
    paths:
        wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
        classifier: !ref <pretrained_path>/classifier.ckpt
        label_encoder: !ref <pretrained_path>/label_encoder.txt
        model: !ref <pretrained_path>/model.ckpt
##################
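# Log-softmax applied to the classifier output to obtain per-language
# log-probabilities (defined last, referenced in "modules" above).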
softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True
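#
# Minimal loading sketch (an assumption, not part of this config): it requires
# the speechbrain and hyperpyyaml packages and this file saved locally as
# inference_wav2vec.yaml.
#
#     from hyperpyyaml import load_hyperpyyaml
#
#     with open("inference_wav2vec.yaml") as fin:
#         hparams = load_hyperpyyaml(fin)
#
#     # Fetch the checkpoints listed under "paths" and load them into the
#     # modules listed under "loadables".
#     hparams["pretrainer"].collect_files()
#     hparams["pretrainer"].load_collected()
#
# In practice this config is usually consumed through a custom pretrained
# interface (e.g. loaded with speechbrain.pretrained.interfaces.foreign_class)
# that chains wav2vec2 -> attentive -> classifier -> softmax on batched audio.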