File size: 2,869 Bytes
7ffa28e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# Training dataset settings.
# The nesting below was reconstructed: with every key at column 0 the
# `augmentor`, `noise` and `speed` keys were empty and `prob` was defined
# twice at the same level.
train_ds:
  manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
  sample_rate: 16000
  labels: null
  batch_size: 64
  shuffle: true
  # NOTE(review): unit is not stated in this file (presumably seconds) - confirm.
  time_length: 3
  is_tarred: false
  tarred_audio_filepaths: null
  tarred_shard_strategy: scatter
  # Two augmentations are configured, each with its own `prob` of 0.5.
  augmentor:
    noise:
      manifest_path: /manifests/noise/rir_noise_manifest.json
      prob: 0.5
      min_snr_db: 0
      max_snr_db: 15
    speed:
      prob: 0.5
      sr: 16000
      resample_type: kaiser_fast
      min_speed_rate: 0.95
      max_speed_rate: 1.05
  # NOTE(review): placed at the dataset level (not under `speed`) because
  # validation_ds ends with the same two keys - confirm.
  num_workers: 15
  pin_memory: true
# Validation dataset settings. Children are indented under the section key;
# at column 0 they were top-level keys colliding with those of train_ds.
# Unlike train_ds, no augmentor is configured and shuffling is disabled.
validation_ds:
  manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/dev.json
  sample_rate: 16000
  labels: null
  batch_size: 128
  shuffle: false
  time_length: 3
  num_workers: 15
  pin_memory: true
# Shared default values. Children are indented under the section key; at
# column 0 they collided with the identically named keys of the encoder
# blocks (filters, repeat, dropout, separable, se, se_context_size).
model_defaults:
  filters: 1024
  repeat: 3
  dropout: 0.1
  separable: true
  se: true
  se_context_size: -1
  kernel_size_factor: 1.0
  # NOTE(review): the three *_hidden keys are kept in this section because
  # they precede the next section header; nothing else in this file refers
  # to them - confirm they are still needed.
  enc_hidden: 640
  pred_hidden: 640
  joint_hidden: 640
# Audio front end. `_target_` names the class to instantiate; the remaining
# keys are its settings. Children are indented under the section key so that
# `_target_` and `sample_rate` no longer duplicate keys of other sections.
preprocessor:
  _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
  normalize: per_feature
  window_size: 0.025
  sample_rate: 16000
  window_stride: 0.01
  window: hann
  # Same value as encoder.feat_in (80).
  features: 80
  n_fft: 512
  frame_splicing: 1
  dither: 1.0e-05
# Spectrogram augmentation. Children are indented under the section key so
# that `_target_` is no longer a duplicate top-level key.
spec_augment:
  _target_: nemo.collections.asr.modules.SpectrogramAugmentation
  freq_masks: 3
  freq_width: 4
  time_masks: 5
  # NOTE(review): time_width is fractional while freq_width is an integer;
  # the two presumably use different units - confirm against the module docs.
  time_width: 0.03
# Encoder. `jasper` is a sequence of five block definitions; the nesting was
# reconstructed because, flattened to column 0, the keys following each
# `- filters:` entry were not part of the sequence item.
encoder:
  _target_: nemo.collections.asr.modules.ConvASREncoder
  # Same value as preprocessor.features (80).
  feat_in: 80
  activation: relu
  conv_mask: true
  jasper:
    # Block 1: kernel 3, single repeat, no residual, no dropout.
    - filters: 1024
      repeat: 1
      kernel:
        - 3
      stride:
        - 1
      dilation:
        - 1
      dropout: 0.0
      residual: false
      separable: true
      se: true
      se_context_size: -1
    # Blocks 2-4 differ only in kernel size (7, 11, 15).
    - filters: 1024
      repeat: 3
      kernel:
        - 7
      stride:
        - 1
      dilation:
        - 1
      dropout: 0.1
      residual: true
      separable: true
      se: true
      se_context_size: -1
    - filters: 1024
      repeat: 3
      kernel:
        - 11
      stride:
        - 1
      dilation:
        - 1
      dropout: 0.1
      residual: true
      separable: true
      se: true
      se_context_size: -1
    - filters: 1024
      repeat: 3
      kernel:
        - 15
      stride:
        - 1
      dilation:
        - 1
      dropout: 0.1
      residual: true
      separable: true
      se: true
      se_context_size: -1
    # Block 5: kernel 1, widens to 3072 filters (same value as
    # decoder.feat_in).
    - filters: 3072
      repeat: 1
      kernel:
        - 1
      stride:
        - 1
      dilation:
        - 1
      dropout: 0.0
      residual: false
      separable: true
      se: true
      se_context_size: -1
# Decoder. Children are indented under the section key so that `_target_`
# and `feat_in` no longer duplicate keys of other sections.
decoder:
  _target_: nemo.collections.asr.modules.SpeakerDecoder
  # Same value as the `filters` of the last encoder block (3072).
  feat_in: 3072
  # NOTE(review): presumably the number of training speaker labels - confirm
  # against the training manifest.
  num_classes: 16681
  pool_mode: attention
  emb_sizes: 192
  angular: true
# Loss settings. Children are indented under the section key; at column 0
# `loss` itself was empty.
# NOTE(review): scale/margin presumably parameterise the angular loss
# selected by decoder.angular - confirm.
loss:
  scale: 30
  margin: 0.2
# Optimizer and learning-rate schedule. The nesting was reconstructed: with
# every key at column 0, `name` was defined twice (sgd, CosineAnnealing) and
# `sched` was empty.
optim:
  name: sgd
  lr: 0.08
  weight_decay: 0.0002
  sched:
    name: CosineAnnealing
    warmup_ratio: 0.1
    min_lr: 0.0
  # NOTE(review): placed on the optimizer rather than under `sched` on the
  # assumption that momentum is an SGD setting appended after `sched` -
  # confirm against the original file.
  momentum: 0.9
# Dotted Python path; presumably the model class this configuration is
# loaded into - confirm. The stray `|` that followed this entry was
# file-viewer residue, not part of the document, and has been removed.
target: nemo.collections.asr.models.label_models.EncDecSpeakerLabelModel