# Fine-tuning configuration for a TitaNet speaker model that starts from
# pretrained weights (see `init_from_pretrained_model`).
#
# `???` marks a value with no default; it must be supplied by the user
# (e.g. on the command line) before the config can be used.
# `${...}` values are interpolations resolved against this same config tree.

# Experiment name; reused by `exp_manager.name` through the `*name` alias.
name: &name "TitaNet-Finetune"

# Audio sample rate; reused through the `*sample_rate` alias by the
# speed augmentor and the preprocessor.
sample_rate: &sample_rate 16000

# Pretrained checkpoint to initialise from.
# NOTE(review): `include` / `exclude` appear to be module-name prefixes that
# select which pretrained weights are loaded or skipped - confirm against the
# NeMo documentation before changing them.
init_from_pretrained_model:
  speaker_tasks:
    name: 'titanet_large'
    include: ["preprocessor", "encoder"]
    exclude: ["decoder.final"]

model:
  # Training data.
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    # Speed perturbation applied with probability `prob`, with the rate drawn
    # between `min_speed_rate` and `max_speed_rate`.
    augmentor:
      speed:
        prob: 0.3
        sr: *sample_rate
        resample_type: 'kaiser_fast'
        min_speed_rate: 0.95
        max_speed_rate: 1.05

  # Validation data.
  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: null
    batch_size: 128
    shuffle: false

  # Test data; `embedding_dir` is the directory path used for embeddings.
  test_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: null
    batch_size: 1
    shuffle: false
    embedding_dir: './embeddings'

  # Shared defaults referenced by the encoder blocks below through
  # `${model.model_defaults.*}` interpolations.
  model_defaults:
    filters: 1024
    repeat: 3
    dropout: 0.1
    separable: true
    se: true
    se_context_size: -1
    kernel_size_factor: 1.0

  # Audio -> mel-spectrogram front end. `features` defines the `&n_mels`
  # anchor that the encoder consumes as `feat_in`.
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.025
    sample_rate: *sample_rate
    window_stride: 0.01
    window: "hann"
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    # Encoder blocks, in order: one non-residual block (kernel 3), three
    # residual blocks (kernels 7, 11, 15) that take `repeat` and `dropout`
    # from `model_defaults`, and a final non-residual kernel-1 block whose
    # filter count is anchored as `&enc_feat_out` for the decoder.
    jasper:
      - filters: ${model.model_defaults.filters}
        repeat: 1
        kernel: [3]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [7]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [11]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: &enc_feat_out 3072
        repeat: 1
        kernel: [1]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

  # Speaker decoder; `feat_in` is tied to the last encoder block's filter
  # count. `num_classes` must be supplied by the user.
  decoder:
    _target_: nemo.collections.asr.modules.SpeakerDecoder
    feat_in: *enc_feat_out
    num_classes: ???
    pool_mode: 'attention'
    emb_sizes: 192

  loss:
    _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss
    scale: 30
    margin: 0.2

  # Per-module optimizer overrides: the encoder gets its own learning rate,
  # different from the base `optim.lr` below.
  optim_param_groups:
    encoder:
      lr: 0.001

  optim:
    name: adamw
    lr: 0.0001
    weight_decay: 0.0002

    # Learning-rate scheduler.
    sched:
      name: CosineAnnealing
      warmup_ratio: 0.1
      min_lr: 0.0

trainer:
  devices: 1
  max_epochs: 10
  max_steps: -1
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  deterministic: true
  # Checkpointing and logging are switched off on the trainer itself;
  # `exp_manager` below enables its own tensorboard logger and checkpoint
  # callback.
  enable_checkpointing: false
  logger: false
  log_every_n_steps: 1
  val_check_interval: 1.0
  gradient_clip_val: 1.0

exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true