# Residue from the page this file was copied from (not configuration data):
#   File size: 2,866 Bytes
#   7786141
# The viewer's line-number gutter (1-153) that followed has been dropped so
# that the document parses as YAML.
# Decoder settings for the class named by `_target_` (SpeakerDecoder).
# `feat_in` (3072) equals the `filters` value of the last `encoder.jasper`
# entry; presumably the two must stay in sync -- confirm before changing either.
decoder:
  _target_: nemo.collections.asr.modules.SpeakerDecoder
  # NOTE(review): false here, while a `loss` section with margin/scale exists.
  angular: false
  emb_sizes: 192
  feat_in: 3072
  num_classes: 2
  pool_mode: attention
# Encoder settings for the class named by `_target_` (ConvASREncoder).
# `feat_in` (80) equals `preprocessor.features`.
# `jasper` is an ordered list of five block descriptions. Single-element lists
# are written in flow form (`[1]`), which parses to the same one-item sequence
# as the block form.
encoder:
  _target_: nemo.collections.asr.modules.ConvASREncoder
  activation: relu
  conv_mask: true
  feat_in: 80
  jasper:
    # Block 1: kernel 3, 1024 filters, single repeat, no residual, no dropout.
    - dilation: [1]
      dropout: 0.0
      filters: 1024
      kernel: [3]
      repeat: 1
      residual: false
      se: true
      se_context_size: -1
      separable: true
      stride: [1]
    # Block 2: kernel 7, 1024 filters, three repeats, residual, dropout 0.1.
    - dilation: [1]
      dropout: 0.1
      filters: 1024
      kernel: [7]
      repeat: 3
      residual: true
      se: true
      se_context_size: -1
      separable: true
      stride: [1]
    # Block 3: same as block 2 except kernel 11.
    - dilation: [1]
      dropout: 0.1
      filters: 1024
      kernel: [11]
      repeat: 3
      residual: true
      se: true
      se_context_size: -1
      separable: true
      stride: [1]
    # Block 4: same as block 2 except kernel 15.
    - dilation: [1]
      dropout: 0.1
      filters: 1024
      kernel: [15]
      repeat: 3
      residual: true
      se: true
      se_context_size: -1
      separable: true
      stride: [1]
    # Block 5: kernel 1, 3072 filters (the value `decoder.feat_in` uses),
    # single repeat, no residual, no dropout.
    - dilation: [1]
      dropout: 0.0
      filters: 3072
      kernel: [1]
      repeat: 1
      residual: false
      se: true
      se_context_size: -1
      separable: true
      stride: [1]
# Loss hyperparameters.
# NOTE(review): `decoder.angular` is false in this file; presumably `margin`
# and `scale` only apply to an angular-margin loss -- confirm whether these
# values are read at all in this configuration.
loss:
  margin: 0.2
  scale: 30
# Shared default values. Several entries repeat numbers that also appear in
# `encoder.jasper` (dropout 0.1, filters 1024, repeat 3, se true,
# se_context_size -1, separable true).
# NOTE(review): no key in the visible part of this file refers to these values
# by interpolation, and enc_hidden / joint_hidden / pred_hidden have no visible
# consumer here -- confirm whether this section is still needed.
model_defaults:
  dropout: 0.1
  enc_hidden: 640
  filters: 1024
  joint_hidden: 640
  kernel_size_factor: 1.0
  pred_hidden: 640
  repeat: 3
  se: true
  se_context_size: -1
  separable: true
# Optimizer settings: `name` is sgd, with lr, momentum and weight_decay given
# alongside it.
optim:
  lr: 0.08
  momentum: 0.9
  name: sgd
  # Nested schedule settings (CosineAnnealing with a minimum lr and a warmup
  # ratio). Presumably warmup_ratio is a fraction of total training steps --
  # TODO confirm.
  sched:
    min_lr: 0.0
    name: CosineAnnealing
    warmup_ratio: 0.1
  weight_decay: 0.0002
# Feature-extraction settings for the class named by `_target_`
# (AudioToMelSpectrogramPreprocessor).
# `features` (80) equals `encoder.feat_in`; `sample_rate` (16000) equals the
# `sample_rate` set in `train_ds` and `validation_ds`.
preprocessor:
  _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
  dither: 1.0e-05
  features: 80
  frame_splicing: 1
  n_fft: 512
  normalize: per_feature
  sample_rate: 16000
  window: hann
  # Presumably seconds (25 ms window, 10 ms hop) -- TODO confirm units.
  window_size: 0.025
  window_stride: 0.01
# Spectrogram augmentation settings for the class named by `_target_`
# (SpectrogramAugmentation): 3 frequency masks and 5 time masks.
# NOTE(review): `freq_width` is an integer (4) while `time_width` is fractional
# (0.03); presumably a bin count versus a fraction of the sequence length --
# confirm against the module's documentation.
spec_augment:
  _target_: nemo.collections.asr.modules.SpectrogramAugmentation
  freq_masks: 3
  freq_width: 4
  time_masks: 5
  time_width: 0.03
# Fully qualified class path; presumably the model class this configuration
# belongs to.
# NOTE(review): spelled `target`, not `_target_` as in the sections above --
# looks intentional, but confirm against whatever loads this file.
target: nemo.collections.asr.models.label_models.EncDecSpeakerLabelModel
# Training dataset settings.
train_ds:
  # Two augmentations are configured, each with prob 0.5.
  augmentor:
    noise:
      # Absolute, environment-specific path.
      manifest_path: /manifests/noise/rir_noise_manifest.json
      # SNR range for the noise augmentation: 0 to 15 dB.
      max_snr_db: 15
      min_snr_db: 0
      prob: 0.5
    speed:
      # Speed-rate range: 0.95 to 1.05.
      max_speed_rate: 1.05
      min_speed_rate: 0.95
      prob: 0.5
      resample_type: kaiser_fast
      # Same value as `sample_rate` below.
      sr: 16000
  batch_size: 64
  # Tarred input is off; `tarred_audio_filepaths` below is null accordingly.
  is_tarred: false
  labels: null
  # Absolute, environment-specific path.
  manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
  num_workers: 15
  pin_memory: true
  sample_rate: 16000
  shuffle: true
  tarred_audio_filepaths: null
  tarred_shard_strategy: scatter
  # Presumably a segment length in seconds -- TODO confirm units.
  time_length: 3
# Validation dataset settings. Compared with `train_ds`: batch size is 128
# instead of 64, shuffling is off, and no augmentor is configured.
validation_ds:
  batch_size: 128
  labels: null
  # Absolute, environment-specific path.
  manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/dev.json
  num_workers: 15
  pin_memory: true
  sample_rate: 16000
  shuffle: false
  # Presumably a segment length in seconds -- TODO confirm units.
  time_length: 3