# File size: 2,869 Bytes
# 7ffa28e
# Training dataset: where the samples come from, how they are batched and
# loaded, and which augmentations are configured.
train_ds:
  # Data source
  manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
  sample_rate: 16000
  labels: null
  time_length: 3

  # Tarred-dataset options (is_tarred is false and no tar paths are given)
  is_tarred: false
  tarred_audio_filepaths: null
  tarred_shard_strategy: scatter

  # Batching and data loading
  batch_size: 64
  shuffle: true
  num_workers: 15
  pin_memory: true

  # Augmentations, kept last so the scalar options above are not mistaken
  # for children of this nested mapping.
  augmentor:
    noise:
      manifest_path: /manifests/noise/rir_noise_manifest.json
      prob: 0.5
      min_snr_db: 0
      max_snr_db: 15
    speed:
      prob: 0.5
      sr: 16000
      resample_type: kaiser_fast
      min_speed_rate: 0.95
      max_speed_rate: 1.05
# Validation dataset settings. Unlike train_ds, this section defines no
# augmentor or tarred-dataset keys and turns shuffling off.
validation_ds:
  manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/dev.json
  sample_rate: 16000  # same value as train_ds.sample_rate
  labels: null
  batch_size: 128  # twice the train_ds batch_size (64)
  shuffle: false
  # Same value as train_ds.time_length.
  # NOTE(review): presumably a duration in seconds - confirm.
  time_length: 3
  num_workers: 15
  pin_memory: true
# Shared default hyperparameters. The filters, repeat, dropout, separable, se
# and se_context_size values below equal the ones written out literally in the
# middle encoder.jasper blocks; no key visible in this file references this
# section by interpolation.
# NOTE(review): kernel_size_factor, enc_hidden, pred_hidden and joint_hidden
# are not used by any other key visible here - confirm whether the model
# reads them before relying on or removing them.
model_defaults:
  filters: 1024
  repeat: 3
  dropout: 0.1
  separable: true
  se: true
  se_context_size: -1
  kernel_size_factor: 1.0
  enc_hidden: 640
  pred_hidden: 640
  joint_hidden: 640
# Audio front-end settings; _target_ names the class these arguments are
# written for.
preprocessor:
  _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
  normalize: per_feature
  # NOTE(review): presumably seconds (25 ms window) - confirm against the
  # target class.
  window_size: 0.025
  sample_rate: 16000  # same value as train_ds / validation_ds sample_rate
  # NOTE(review): presumably seconds (10 ms hop) - confirm.
  window_stride: 0.01
  window: hann
  features: 80  # same value as encoder.feat_in
  n_fft: 512
  frame_splicing: 1
  dither: 1.0e-05
# Spectrogram augmentation settings; _target_ names the class these arguments
# are written for.
spec_augment:
  _target_: nemo.collections.asr.modules.SpectrogramAugmentation
  freq_masks: 3
  freq_width: 4
  time_masks: 5
  # A fractional value, unlike the integer freq_width above.
  # NOTE(review): presumably interpreted as a fraction of the input length
  # rather than a frame count - confirm against the target class.
  time_width: 0.03
# Convolutional encoder settings; _target_ names the class these arguments are
# written for. The `jasper` list holds five blocks, in order.
encoder:
  _target_: nemo.collections.asr.modules.ConvASREncoder
  feat_in: 80  # same value as preprocessor.features
  activation: relu
  conv_mask: true
  jasper:
    # Block 1: single repeat, kernel 3, no residual, no dropout.
    - filters: 1024
      repeat: 1
      kernel: [3]
      stride: [1]
      dilation: [1]
      dropout: 0.0
      residual: false
      separable: true
      se: true
      se_context_size: -1
    # Block 2: three repeats, kernel 7, residual, dropout 0.1.
    - filters: 1024
      repeat: 3
      kernel: [7]
      stride: [1]
      dilation: [1]
      dropout: 0.1
      residual: true
      separable: true
      se: true
      se_context_size: -1
    # Block 3: three repeats, kernel 11, residual, dropout 0.1.
    - filters: 1024
      repeat: 3
      kernel: [11]
      stride: [1]
      dilation: [1]
      dropout: 0.1
      residual: true
      separable: true
      se: true
      se_context_size: -1
    # Block 4: three repeats, kernel 15, residual, dropout 0.1.
    - filters: 1024
      repeat: 3
      kernel: [15]
      stride: [1]
      dilation: [1]
      dropout: 0.1
      residual: true
      separable: true
      se: true
      se_context_size: -1
    # Block 5: single repeat, kernel 1, widens to 3072 filters
    # (the value decoder.feat_in uses); no residual, no dropout.
    - filters: 3072
      repeat: 1
      kernel: [1]
      stride: [1]
      dilation: [1]
      dropout: 0.0
      residual: false
      separable: true
      se: true
      se_context_size: -1
# Decoder head settings; _target_ names the class these arguments are written
# for.
decoder:
  _target_: nemo.collections.asr.modules.SpeakerDecoder
  feat_in: 3072  # same value as `filters` of the last encoder.jasper block
  # NOTE(review): presumably the number of distinct speaker labels in the
  # training manifest - confirm before reusing with other data.
  num_classes: 16681
  pool_mode: attention
  emb_sizes: 192
  # NOTE(review): presumably pairs with the scale/margin values under `loss`
  # - confirm.
  angular: true
# Loss hyperparameters. This section does not name a loss class (no _target_).
# NOTE(review): presumably consumed by an angular-margin loss selected via
# decoder.angular - confirm.
loss:
  scale: 30
  margin: 0.2
# Optimizer and learning-rate schedule.
optim:
  # Optimizer selection and its scalar hyperparameters.
  name: sgd
  lr: 0.08
  momentum: 0.9
  weight_decay: 0.0002

  # Learning-rate schedule, listed last so every optimizer scalar sits
  # together above it.
  sched:
    name: CosineAnnealing
    warmup_ratio: 0.1
    min_lr: 0.0
# Fully qualified class path of the model this configuration belongs to.
# NOTE(review): this key is spelled `target`, not `_target_` like the module
# sections earlier in the file; presumably it is framework-written metadata -
# confirm before renaming it.
target: nemo.collections.asr.models.label_models.EncDecSpeakerLabelModel