|
|
End-to-End Speaker Diarization Configuration Files |
|
|
================================================== |
|
|
|
|
|
Hydra Configurations for Sortformer Diarizer Training |
|
|
----------------------------------------------------- |
|
|
|
|
|
Sortformer Diarizer is an end-to-end speaker diarization model based solely on a Transformer-encoder architecture. |
|
|
Model name convention for Sortformer Diarizer: sortformer_diarizer_<loss_type>_<speaker count limit>-<version>.yaml |
|
|
|
|
|
|
|
|
* Example: ``<NeMo_root>/examples/speaker_tasks/diarization/conf/neural_diarizer/sortformer_diarizer_hybrid_loss_4spk-v1.yaml``. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
name: "SortFormerDiarizer" |
|
|
num_workers: 18 |
|
|
batch_size: 8 |
|
|
|
|
|
model: |
|
|
sample_rate: 16000 |
|
|
pil_weight: 0.5 |
|
|
ats_weight: 0.5 |
|
|
max_num_of_spks: 4 |
|
|
|
|
|
model_defaults: |
|
|
fc_d_model: 512 |
|
|
tf_d_model: 192 |
|
|
|
|
|
train_ds: |
|
|
manifest_filepath: ??? |
|
|
sample_rate: ${model.sample_rate} |
|
|
num_spks: ${model.max_num_of_spks} |
|
|
session_len_sec: 90 |
|
|
soft_label_thres: 0.5 |
|
|
soft_targets: False |
|
|
labels: null |
|
|
batch_size: ${batch_size} |
|
|
shuffle: True |
|
|
num_workers: ${num_workers} |
|
|
validation_mode: False |
|
|
|
|
|
use_lhotse: False |
|
|
use_bucketing: True |
|
|
num_buckets: 10 |
|
|
bucket_duration_bins: [10, 20, 30, 40, 50, 60, 70, 80, 90] |
|
|
pin_memory: True |
|
|
min_duration: 10 |
|
|
max_duration: 90 |
|
|
batch_duration: 400 |
|
|
quadratic_duration: 1200 |
|
|
bucket_buffer_size: 20000 |
|
|
shuffle_buffer_size: 10000 |
|
|
window_stride: ${model.preprocessor.window_stride} |
|
|
subsampling_factor: ${model.encoder.subsampling_factor} |
|
|
|
|
|
validation_ds: |
|
|
manifest_filepath: ??? |
|
|
is_tarred: False |
|
|
tarred_audio_filepaths: null |
|
|
sample_rate: ${model.sample_rate} |
|
|
num_spks: ${model.max_num_of_spks} |
|
|
session_len_sec: 90 |
|
|
soft_label_thres: 0.5 |
|
|
soft_targets: False |
|
|
labels: null |
|
|
batch_size: ${batch_size} |
|
|
shuffle: False |
|
|
num_workers: ${num_workers} |
|
|
validation_mode: True |
|
|
|
|
|
use_lhotse: False |
|
|
use_bucketing: False |
|
|
drop_last: False |
|
|
pin_memory: True |
|
|
window_stride: ${model.preprocessor.window_stride} |
|
|
subsampling_factor: ${model.encoder.subsampling_factor} |
|
|
|
|
|
test_ds: |
|
|
manifest_filepath: null |
|
|
is_tarred: False |
|
|
tarred_audio_filepaths: null |
|
|
sample_rate: 16000 |
|
|
num_spks: ${model.max_num_of_spks} |
|
|
session_len_sec: 90 |
|
|
soft_label_thres: 0.5 |
|
|
soft_targets: False |
|
|
labels: null |
|
|
batch_size: ${batch_size} |
|
|
shuffle: False |
|
|
seq_eval_mode: True |
|
|
num_workers: ${num_workers} |
|
|
validation_mode: True |
|
|
|
|
|
use_lhotse: False |
|
|
use_bucketing: False |
|
|
drop_last: False |
|
|
pin_memory: True |
|
|
window_stride: ${model.preprocessor.window_stride} |
|
|
subsampling_factor: ${model.encoder.subsampling_factor} |
|
|
|
|
|
preprocessor: |
|
|
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor |
|
|
normalize: "per_feature" |
|
|
window_size: 0.025 |
|
|
sample_rate: ${model.sample_rate} |
|
|
window_stride: 0.01 |
|
|
window: "hann" |
|
|
features: 80 |
|
|
n_fft: 512 |
|
|
frame_splicing: 1 |
|
|
dither: 0.00001 |
|
|
|
|
|
sortformer_modules: |
|
|
_target_: nemo.collections.asr.modules.sortformer_modules.SortformerModules |
|
|
num_spks: ${model.max_num_of_spks} |
|
|
dropout_rate: 0.5 |
|
|
fc_d_model: ${model.model_defaults.fc_d_model} |
|
|
tf_d_model: ${model.model_defaults.tf_d_model} |
|
|
|
|
|
encoder: |
|
|
_target_: nemo.collections.asr.modules.ConformerEncoder |
|
|
feat_in: ${model.preprocessor.features} |
|
|
feat_out: -1 |
|
|
n_layers: 18 |
|
|
d_model: ${model.model_defaults.fc_d_model} |
|
|
|
|
|
|
|
|
subsampling: dw_striding |
|
|
subsampling_factor: 8 |
|
|
subsampling_conv_channels: 256 |
|
|
causal_downsampling: false |
|
|
|
|
|
ff_expansion_factor: 4 |
|
|
|
|
|
self_attention_model: rel_pos |
|
|
n_heads: 8 |
|
|
|
|
|
att_context_size: [-1, -1] |
|
|
att_context_style: regular |
|
|
xscaling: true |
|
|
untie_biases: true |
|
|
pos_emb_max_len: 5000 |
|
|
|
|
|
conv_kernel_size: 9 |
|
|
conv_norm_type: 'batch_norm' |
|
|
conv_context_size: null |
|
|
|
|
|
dropout: 0.1 |
|
|
dropout_pre_encoder: 0.1 |
|
|
dropout_emb: 0.0 |
|
|
dropout_att: 0.1 |
|
|
|
|
|
stochastic_depth_drop_prob: 0.0 |
|
|
stochastic_depth_mode: linear |
|
|
stochastic_depth_start_layer: 1 |
|
|
|
|
|
transformer_encoder: |
|
|
_target_: nemo.collections.asr.modules.transformer.transformer_encoders.TransformerEncoder |
|
|
num_layers: 18 |
|
|
hidden_size: ${model.model_defaults.tf_d_model} |
|
|
inner_size: 768 |
|
|
num_attention_heads: 8 |
|
|
attn_score_dropout: 0.5 |
|
|
attn_layer_dropout: 0.5 |
|
|
ffn_dropout: 0.5 |
|
|
hidden_act: relu |
|
|
pre_ln: False |
|
|
pre_ln_final_layer_norm: True |
|
|
|
|
|
loss: |
|
|
_target_: nemo.collections.asr.losses.bce_loss.BCELoss |
|
|
weight: null |
|
|
reduction: mean |
|
|
|
|
|
lr: 0.0001 |
|
|
optim: |
|
|
name: adamw |
|
|
lr: ${model.lr} |
|
|
|
|
|
betas: [0.9, 0.98] |
|
|
weight_decay: 1e-3 |
|
|
|
|
|
sched: |
|
|
name: InverseSquareRootAnnealing |
|
|
warmup_steps: 2500 |
|
|
warmup_ratio: null |
|
|
min_lr: 1e-06 |
|
|
|
|
|
trainer: |
|
|
devices: 1 |
|
|
accelerator: gpu |
|
|
max_epochs: 800 |
|
|
max_steps: -1 |
|
|
num_nodes: 1 |
|
|
strategy: ddp_find_unused_parameters_true |
|
|
accumulate_grad_batches: 1 |
|
|
deterministic: True |
|
|
enable_checkpointing: False |
|
|
logger: False |
|
|
log_every_n_steps: 1 |
|
|
val_check_interval: 1.0 |
|
|
|
|
|
exp_manager: |
|
|
use_datetime_version: False |
|
|
exp_dir: null |
|
|
name: ${name} |
|
|
resume_if_exists: True |
|
|
resume_from_checkpoint: null |
|
|
resume_ignore_no_checkpoint: True |
|
|
create_tensorboard_logger: True |
|
|
create_checkpoint_callback: True |
|
|
create_wandb_logger: False |
|
|
checkpoint_callback_params: |
|
|
monitor: "val_f1_acc" |
|
|
mode: "max" |
|
|
save_top_k: 9 |
|
|
every_n_epochs: 1 |
|
|
wandb_logger_kwargs: |
|
|
resume: True |
|
|
name: null |
|
|
project: null |
|
|
|
|
|
Hydra Configurations for Sortformer Diarization Post-processing |
|
|
--------------------------------------------------------------- |
|
|
|
|
|
Post-processing converts the floating-point tensor output into timestamp output. When generating speaker-homogeneous segments, the onset and offset thresholds and |
|
|
paddings can be tuned to produce the timestamps that lead to the lowest diarization error rate (DER). |
|
|
|
|
|
|
|
|
By default, post-processing is bypassed, and only binarization is performed. If you want to reproduce the DER scores reported on NeMo model cards, you need to apply the post-processing steps. Use ``batch_size = 1`` to get the longest inference window and the highest possible accuracy. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
parameters: |
|
|
onset: 0.64 |
|
|
offset: 0.74 |
|
|
pad_onset: 0.06 |
|
|
pad_offset: 0.0 |
|
|
min_duration_on: 0.1 |
|
|
min_duration_off: 0.15 |
|
|
|
|
|
|
|
|
Cascaded Speaker Diarization Configuration Files |
|
|
================================================ |
|
|
|
|
|
Both training and inference of cascaded speaker diarization are configured by ``.yaml`` files. The diarizer section will generally require information about the dataset(s) being used, the models used in this pipeline, as well as inference-related parameters such as the post-processing for each model. The sections on this page cover each of these in more detail. |
|
|
|
|
|
.. note:: |
|
|
For model details and a deeper understanding of configs, training, fine-tuning and evaluation, |
|
|
please refer to ``<NeMo_root>/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb`` and ``<NeMo_root>/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb``; |
|
|
for other applications such as possible integration with ASR, have a look at ``<NeMo_root>/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb``. |
|
|
|
|
|
|
|
|
Hydra Configurations for Diarization Training |
|
|
--------------------------------------------- |
|
|
|
|
|
Currently, NeMo supports the multi-scale diarization decoder (MSDD) as a neural diarizer model. MSDD is a speaker diarization model that relies on an initial clustering result and multi-scale segmentation input. Example configuration files for MSDD model training can be found in ``<NeMo_root>/examples/speaker_tasks/diarization/conf/neural_diarizer/``. |
|
|
|
|
|
* Model name convention for MSDD: msdd_<number of scales>scl_<longest scale in deciseconds (ds)>_<shortest scale in deciseconds (ds)>_<overlap percentage of window shifting>Povl_<hidden layer size>x<number of LSTM layers>x<number of CNN output channels>x<repetition count of conv layer> |
|
|
* Example: ``msdd_5scl_15_05_50Povl_256x3x32x2.yaml`` has 5 scales, the longest scale is 1.5 sec, the shortest scale is 0.5 sec, with 50 percent overlap, hidden layer size is 256, 3 LSTM layers, 32 CNN channels, 2 repeated Conv layers |
|
|
|
|
|
The MSDD model checkpoint (.ckpt) and NeMo file (.nemo) contain the speaker embedding model (TitaNet), and the speaker model is loaded along with the standalone MSDD module. Note that MSDD models require more than one scale. Thus, the parameters in ``diarizer.speaker_embeddings.parameters`` should have more than one scale to function as an MSDD model. |
|
|
|
|
|
|
|
|
General Diarizer Configuration |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
|
|
|
The items (OmegaConf keys) directly under ``model`` determine segmentation and clustering related parameters. Multi-scale parameters (``window_length_in_sec``, ``shift_length_in_sec`` and ``multiscale_weights``) are specified. ``max_num_of_spks``, ``scale_n``, ``soft_label_thres`` and ``emb_batch_size`` are set here and then assigned to dataset configurations. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
diarizer: |
|
|
out_dir: null |
|
|
oracle_vad: True |
|
|
speaker_embeddings: |
|
|
model_path: ??? |
|
|
parameters: |
|
|
window_length_in_sec: [1.5,1.25,1.0,0.75,0.5] |
|
|
shift_length_in_sec: [0.75,0.625,0.5,0.375,0.25] |
|
|
multiscale_weights: [1,1,1,1,1] |
|
|
save_embeddings: True |
|
|
|
|
|
|
|
|
num_workers: ${num_workers} |
|
|
max_num_of_spks: 2 |
|
|
scale_n: 5 |
|
|
soft_label_thres: 0.5 |
|
|
emb_batch_size: 0 |
|
|
|
|
|
Dataset Configuration |
|
|
^^^^^^^^^^^^^^^^^^^^^ |
|
|
|
|
|
Training, validation, and test parameters are specified using the ``train_ds``, ``validation_ds``, and |
|
|
``test_ds`` sections in the configuration YAML file, respectively. Items such as ``num_spks``, ``soft_label_thres`` and ``emb_batch_size`` follow the settings in the ``model`` key. You may also leave fields such as ``manifest_filepath`` or ``emb_dir`` blank, and then specify them via the command-line interface. Note that ``test_ds`` is not used during training and is only used for speaker diarization inference. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
train_ds: |
|
|
manifest_filepath: ??? |
|
|
emb_dir: ??? |
|
|
sample_rate: ${sample_rate} |
|
|
num_spks: ${model.max_num_of_spks} |
|
|
soft_label_thres: ${model.soft_label_thres} |
|
|
labels: null |
|
|
batch_size: ${batch_size} |
|
|
emb_batch_size: ${model.emb_batch_size} |
|
|
shuffle: True |
|
|
|
|
|
validation_ds: |
|
|
manifest_filepath: ??? |
|
|
emb_dir: ??? |
|
|
sample_rate: ${sample_rate} |
|
|
num_spks: ${model.max_num_of_spks} |
|
|
soft_label_thres: ${model.soft_label_thres} |
|
|
labels: null |
|
|
batch_size: 2 |
|
|
emb_batch_size: ${model.emb_batch_size} |
|
|
shuffle: False |
|
|
|
|
|
test_ds: |
|
|
manifest_filepath: null |
|
|
emb_dir: null |
|
|
sample_rate: 16000 |
|
|
num_spks: ${model.max_num_of_spks} |
|
|
soft_label_thres: ${model.soft_label_thres} |
|
|
labels: null |
|
|
batch_size: 2 |
|
|
shuffle: False |
|
|
seq_eval_mode: False |
|
|
|
|
|
|
|
|
Pre-processor Configuration |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
|
|
|
In the MSDD configuration, the pre-processor configuration follows the pre-processor of the embedding extractor model. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
preprocessor: |
|
|
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor |
|
|
normalize: "per_feature" |
|
|
window_size: 0.025 |
|
|
sample_rate: ${sample_rate} |
|
|
window_stride: 0.01 |
|
|
window: "hann" |
|
|
features: 80 |
|
|
n_fft: 512 |
|
|
frame_splicing: 1 |
|
|
dither: 0.00001 |
|
|
|
|
|
|
|
|
Model Architecture Configurations |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
|
|
|
The hyper-parameters for MSDD models are under the ``msdd_module`` key. The model architecture can be changed by setting the ``weighting_scheme`` and ``context_vector_type``. A detailed explanation of the architecture can be found on the :doc:`Models <./models>` page. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
msdd_module: |
|
|
_target_: nemo.collections.asr.modules.msdd_diarizer.MSDD_module |
|
|
num_spks: ${model.max_num_of_spks} |
|
|
hidden_size: 256 |
|
|
num_lstm_layers: 3 |
|
|
dropout_rate: 0.5 |
|
|
cnn_output_ch: 32 |
|
|
conv_repeat: 2 |
|
|
emb_dim: 192 |
|
|
scale_n: ${model.scale_n} |
|
|
weighting_scheme: 'conv_scale_weight' |
|
|
context_vector_type: 'cos_sim' |
|
|
|
|
|
Loss Configurations |
|
|
^^^^^^^^^^^^^^^^^^^ |
|
|
|
|
|
The neural diarizer uses a binary cross entropy (BCE) loss. A set of weights for the negative (absence of the speaker's speech) and positive (presence of the speaker's speech) classes can be provided to the loss function. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
loss: |
|
|
_target_: nemo.collections.asr.losses.bce_loss.BCELoss |
|
|
weight: null |
|
|
|
|
|
|
|
|
Hydra Configurations for Diarization Inference |
|
|
============================================== |
|
|
|
|
|
Example configuration files for speaker diarization inference can be found in ``<NeMo_root>/examples/speaker_tasks/diarization/conf/inference/``. Choose a yaml file that fits your targeted domain. For example, if you want to diarize audio recordings of telephonic speech, choose ``diar_infer_telephonic.yaml``. |
|
|
|
|
|
The configurations for all the components of diarization inference are included in a single file named ``diar_infer_<domain>.yaml``. Each ``.yaml`` file has a few different sections for the following modules: VAD, Speaker Embedding, Clustering and ASR. |
|
|
|
|
|
In speaker diarization inference, the datasets provided in manifest format denote the data that you would like to perform speaker diarization on. |
|
|
|
|
|
Diarizer Configurations |
|
|
----------------------- |
|
|
|
|
|
An example ``diarizer`` Hydra configuration could look like: |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
diarizer: |
|
|
manifest_filepath: ??? |
|
|
out_dir: ??? |
|
|
oracle_vad: False |
|
|
collar: 0.25 |
|
|
ignore_overlap: True |
|
|
|
|
|
Under ``diarizer`` key, there are ``vad``, ``speaker_embeddings``, ``clustering`` and ``asr`` keys containing configurations for the inference of the corresponding modules. |
|
|
|
|
|
Configurations for Voice Activity Detector |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
|
|
|
Parameters for the VAD model are provided in the following Hydra config example. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
vad: |
|
|
model_path: null |
|
|
external_vad_manifest: null |
|
|
|
|
|
parameters: |
|
|
window_length_in_sec: 0.15 |
|
|
shift_length_in_sec: 0.01 |
|
|
smoothing: "median" |
|
|
overlap: 0.875 |
|
|
onset: 0.4 |
|
|
offset: 0.7 |
|
|
pad_onset: 0.05 |
|
|
pad_offset: -0.1 |
|
|
min_duration_on: 0.2 |
|
|
min_duration_off: 0.2 |
|
|
filter_speech_first: True |
|
|
|
|
|
Configurations for Speaker Embedding in Diarization |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
|
|
|
Parameters for the speaker embedding model are provided in the following Hydra config example. Note that the multiscale parameters accept either a list or a single floating-point number. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
speaker_embeddings: |
|
|
model_path: ??? |
|
|
parameters: |
|
|
window_length_in_sec: 1.5 |
|
|
shift_length_in_sec: 0.75 |
|
|
multiscale_weights: null |
|
|
save_embeddings: False |
|
|
|
|
|
Configurations for Clustering in Diarization |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
|
|
|
Parameters for the clustering algorithm are provided in the following Hydra config example. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
clustering: |
|
|
parameters: |
|
|
oracle_num_speakers: False |
|
|
max_num_speakers: 20 |
|
|
enhanced_count_thres: 80 |
|
|
max_rp_threshold: 0.25 |
|
|
sparse_search_volume: 30 |
|
|
|
|
|
Configurations for Diarization with ASR |
|
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
|
|
|
|
The following configuration needs to be appended under ``diarizer`` to run ASR with diarization to get a transcription with speaker labels. |
|
|
|
|
|
.. code-block:: yaml |
|
|
|
|
|
asr: |
|
|
model_path: ??? |
|
|
parameters: |
|
|
asr_based_vad: False |
|
|
asr_based_vad_threshold: 50 |
|
|
asr_batch_size: null |
|
|
lenient_overlap_WDER: True |
|
|
decoder_delay_in_sec: null |
|
|
word_ts_anchor_offset: null |
|
|
word_ts_anchor_pos: "start" |
|
|
fix_word_ts_with_VAD: False |
|
|
colored_text: False |
|
|
print_time: True |
|
|
break_lines: False |
|
|
|
|
|
ctc_decoder_parameters: |
|
|
pretrained_language_model: null |
|
|
beam_width: 32 |
|
|
alpha: 0.5 |
|
|
beta: 2.5 |
|
|
|
|
|
realigning_lm_parameters: |
|
|
arpa_language_model: null |
|
|
min_number_of_words: 3 |
|
|
max_number_of_words: 10 |
|
|
logprob_diff_threshold: 1.2 |
|
|
|