|
train_ds: |
|
manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json |
|
sample_rate: 16000 |
|
labels: null |
|
batch_size: 64 |
|
shuffle: true |
|
time_length: 3 |
|
is_tarred: false |
|
tarred_audio_filepaths: null |
|
tarred_shard_strategy: scatter |
|
augmentor: |
|
noise: |
|
manifest_path: /manifests/noise/rir_noise_manifest.json |
|
prob: 0.5 |
|
min_snr_db: 0 |
|
max_snr_db: 15 |
|
speed: |
|
prob: 0.5 |
|
sr: 16000 |
|
resample_type: kaiser_fast |
|
min_speed_rate: 0.95 |
|
max_speed_rate: 1.05 |
|
num_workers: 15 |
|
pin_memory: true |
|
validation_ds: |
|
manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/dev.json |
|
sample_rate: 16000 |
|
labels: null |
|
batch_size: 128 |
|
shuffle: false |
|
time_length: 3 |
|
num_workers: 15 |
|
pin_memory: true |
|
model_defaults: |
|
filters: 1024 |
|
repeat: 3 |
|
dropout: 0.1 |
|
separable: true |
|
se: true |
|
se_context_size: -1 |
|
kernel_size_factor: 1.0 |
|
enc_hidden: 640 |
|
pred_hidden: 640 |
|
joint_hidden: 640 |
|
preprocessor: |
|
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor |
|
normalize: per_feature |
|
window_size: 0.025 |
|
sample_rate: 16000 |
|
window_stride: 0.01 |
|
window: hann |
|
features: 80 |
|
n_fft: 512 |
|
frame_splicing: 1 |
|
dither: 1.0e-05 |
|
spec_augment: |
|
_target_: nemo.collections.asr.modules.SpectrogramAugmentation |
|
freq_masks: 3 |
|
freq_width: 4 |
|
time_masks: 5 |
|
time_width: 0.03 |
|
encoder: |
|
_target_: nemo.collections.asr.modules.ConvASREncoder |
|
feat_in: 80 |
|
activation: relu |
|
conv_mask: true |
|
jasper: |
|
- filters: 1024 |
|
repeat: 1 |
|
kernel: |
|
- 3 |
|
stride: |
|
- 1 |
|
dilation: |
|
- 1 |
|
dropout: 0.0 |
|
residual: false |
|
separable: true |
|
se: true |
|
se_context_size: -1 |
|
- filters: 1024 |
|
repeat: 3 |
|
kernel: |
|
- 7 |
|
stride: |
|
- 1 |
|
dilation: |
|
- 1 |
|
dropout: 0.1 |
|
residual: true |
|
separable: true |
|
se: true |
|
se_context_size: -1 |
|
- filters: 1024 |
|
repeat: 3 |
|
kernel: |
|
- 11 |
|
stride: |
|
- 1 |
|
dilation: |
|
- 1 |
|
dropout: 0.1 |
|
residual: true |
|
separable: true |
|
se: true |
|
se_context_size: -1 |
|
- filters: 1024 |
|
repeat: 3 |
|
kernel: |
|
- 15 |
|
stride: |
|
- 1 |
|
dilation: |
|
- 1 |
|
dropout: 0.1 |
|
residual: true |
|
separable: true |
|
se: true |
|
se_context_size: -1 |
|
- filters: 3072 |
|
repeat: 1 |
|
kernel: |
|
- 1 |
|
stride: |
|
- 1 |
|
dilation: |
|
- 1 |
|
dropout: 0.0 |
|
residual: false |
|
separable: true |
|
se: true |
|
se_context_size: -1 |
|
decoder: |
|
_target_: nemo.collections.asr.modules.SpeakerDecoder |
|
feat_in: 3072 |
|
num_classes: 16681 |
|
pool_mode: attention |
|
emb_sizes: 192 |
|
angular: true |
|
loss: |
|
scale: 30 |
|
margin: 0.2 |
|
optim: |
|
name: sgd |
|
lr: 0.08 |
|
weight_decay: 0.0002 |
|
sched: |
|
name: CosineAnnealing |
|
warmup_ratio: 0.1 |
|
min_lr: 0.0 |
|
momentum: 0.9 |
|
target: nemo.collections.asr.models.label_models.EncDecSpeakerLabelModel |
|
|