# ############################################################################
# Model: ECAPA-TDNN for speaker verification on discrete SSL (WavLM) tokens
# "Lite" configuration for cases where only speaker embeddings are needed
# ############################################################################
# Feature parameters
n_mels: 80
# Pretrained model folder (HuggingFace)
pretrained_path: poonehmousavi/discrete_wavlm_spk_rec_ecapatdn
# Output parameters
save_folder: tmp
### Configuration for discrete SSL model
# ssl_model_type: hubert, wavlm, wav2vec2
# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large
ssl_model_type: wavlm # hubert, wavlm or wav2vec2
ssl_hub: microsoft/wavlm-large
ssl_folder: !ref <save_folder>/ssl_checkpoint
kmeans_repo_id: speechbrain/SSL_Quantization
kmeans_cache_dir: !ref <save_folder>/kmeans_checkpoint
kmeans_dataset: LibriSpeech-100-360-500
freeze_ssl: True
freeze_feature_extractor: True
num_clusters: 1000
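# The SSL model (WavLM-large) is kept frozen; its hidden representations are
# quantized into discrete tokens with k-means codebooks (<num_clusters> centroids
# per selected layer) fetched from <kmeans_repo_id>. The <kmeans_dataset> name
# indicates codebooks trained on the LibriSpeech 100/360/500-hour subsets.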
### Configuration for the tokenizer
# The layer numbers must be among the layers supported by the discrete SSL model
# (a k-means model must be available for each selected layer), e.g.:
# ssl_layer_num: [3, 7, 12, 23]
# deduplicate: [False, False, False, False]
# bpe_tokenizer_path: [null, null, null, null]
ssl_layer_num: [1, 3, 7, 12, 18, 23]
ssl_layer_num_selected: [1, 3, 7, 12, 18, 23]
num_codebooks: 6
deduplicate: [False, False, False, False, False, False]
bpe_tokenizer_path: [null, null, null, null, null, null]
sample_rate: 16000
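# One codebook per entry in <ssl_layer_num>, hence num_codebooks: 6. With
# deduplicate set to False and bpe_tokenizer_path set to null for every layer,
# no run-length merging of repeated tokens and no BPE post-tokenization is
# applied (the usual meaning of these options in SpeechBrain's discrete SSL
# tokenizer).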
# Embedding dimension (hidden size of WavLM-large)
encoder_dim: 1024
# Modules
tokenizer_config:
    SSL_layers: !ref <ssl_layer_num>
    deduplicates: !ref <deduplicate>
    bpe_tokenizers: !ref <bpe_tokenizer_path>
discrete_embedding_layer: !new:custom_interface.Discrete_EmbeddingLayer
    num_codebooks: !ref <num_codebooks>
    vocab_size: !ref <num_clusters>
    emb_dim: !ref <encoder_dim>
    available_layers: !ref <ssl_layer_num>
    layers: !ref <ssl_layer_num_selected>
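# Presumably maps each discrete token ID from each of the <num_codebooks>
# codebooks to a dense <encoder_dim>-dimensional vector through a learned
# embedding table. custom_interface.py ships with the pretrained repo; the
# behaviour described here is inferred from the parameter names.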
attention_mlp: !new:custom_interface.AttentionMLP
    input_dim: !ref <encoder_dim>
    hidden_dim: !ref <encoder_dim>
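# Presumably computes attention weights used to combine the per-layer token
# embeddings into a single feature sequence before the speaker encoder
# (again inferred from the class name in custom_interface.py).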
embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
    input_size: !ref <encoder_dim>
    channels: [1024, 1024, 1024, 1024, 3072]
    kernel_sizes: [5, 3, 3, 3, 1]
    dilations: [1, 2, 3, 4, 1]
    groups: [1, 1, 1, 1, 1]
    attention_channels: 128
    lin_neurons: 192
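# The ECAPA-TDNN encoder turns the fused features into fixed-size
# 192-dimensional speaker embeddings (lin_neurons).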
modules:
    embedding_model: !ref <embedding_model>
    attention_mlp: !ref <attention_mlp>
    discrete_embedding_layer: !ref <discrete_embedding_layer>
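# Only the modules needed to go from discrete tokens to speaker embeddings are
# exposed here, which is what makes this the "lite" variant.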
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        embedding_model: !ref <embedding_model>
        attention_mlp: !ref <attention_mlp>
        discrete_embedding_layer: !ref <discrete_embedding_layer>
    paths:
        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
        attention_mlp: !ref <pretrained_path>/attention_mlp.ckpt
        discrete_embedding_layer: !ref <pretrained_path>/discrete_embedding_layer.ckpt
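# The pretrainer fetches the three checkpoints above from <pretrained_path>
# on HuggingFace and loads them into the corresponding modules. A minimal
# loading sketch, assuming the pretrained repo exposes an inference class in
# custom_interface.py (the classname below is illustrative, not taken from
# the repo):
#
#   from speechbrain.inference.interfaces import foreign_class
#   encoder = foreign_class(
#       source="poonehmousavi/discrete_wavlm_spk_rec_ecapatdn",
#       pymodule_file="custom_interface.py",
#       classname="DiscreteSpkEmbedding",  # hypothetical classname
#   )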