NeonBohdan committed on
Commit
598e67e
1 Parent(s): 1e4ef2c

Updated hifi-V3 config

Browse files
Files changed (2) hide show
  1. config.json +21 -29
  2. speaker_ids.json +1 -1
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "distributed_url": "tcp://localhost:54321",
24
  "mixed_precision": true,
25
  "epochs": 1000,
26
- "batch_size": 16,
27
  "eval_batch_size": 4,
28
  "grad_clip": [
29
  1000,
@@ -45,7 +45,7 @@
45
  "use_grad_scaler": false,
46
  "cudnn_enable": true,
47
  "cudnn_deterministic": false,
48
- "cudnn_benchmark": true,
49
  "training_seed": 54321,
50
  "model": "vits",
51
  "num_loader_workers": 8,
@@ -58,7 +58,7 @@
58
  "frame_shift_ms": null,
59
  "frame_length_ms": null,
60
  "stft_pad_mode": "reflect",
61
- "sample_rate": 16000,
62
  "resample": false,
63
  "preemphasis": 0.0,
64
  "ref_level_db": 20,
@@ -78,7 +78,7 @@
78
  "do_amp_to_db_mel": true,
79
  "pitch_fmax": 640.0,
80
  "pitch_fmin": 0.0,
81
- "signal_norm": false,
82
  "min_level_db": -100,
83
  "symmetric_norm": true,
84
  "max_norm": 4.0,
@@ -100,7 +100,7 @@
100
  "eos": "<EOS>",
101
  "bos": "<BOS>",
102
  "blank": "<BLNK>",
103
- "characters": "\u0430\u0431\u0432\u0433\u0491\u0434\u0435\u0454\u0436\u0437\u0438\u0456\u0457\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u044d\u0451\u044b\u044a",
104
  "punctuations": "!'(),-.:;? ",
105
  "phonemes": null,
106
  "is_unique": true,
@@ -111,7 +111,7 @@
111
  "loss_masking": null,
112
  "sort_by_audio_len": false,
113
  "min_audio_len": 32768,
114
- "max_audio_len": 224000,
115
  "min_text_len": 1,
116
  "max_text_len": Infinity,
117
  "compute_f0": false,
@@ -120,13 +120,10 @@
120
  "start_by_longest": false,
121
  "datasets": [
122
  {
123
- "name": "mailabs",
124
- "path": "./logs/uk_UK",
125
- "meta_file_train": "",
126
- "ignored_speakers": [
127
- "obruchov",
128
- "shepel"
129
- ],
130
  "language": "uk",
131
  "meta_file_val": "",
132
  "meta_file_attn_mask": ""
@@ -135,7 +132,7 @@
135
  "test_sentences": [
136
  [
137
  "\u0412\u0435\u0441\u0435\u043b\u043a\u0430, \u0442\u0430\u043a\u043e\u0436 \u0440\u0430\u0439\u0434\u0443\u0433\u0430 \u043e\u043f\u0442\u0438\u0447\u043d\u0435 \u044f\u0432\u0438\u0449\u0435 \u0432 \u0430\u0442\u043c\u043e\u0441\u0444\u0435\u0440\u0456, \u0449\u043e \u044f\u0432\u043b\u044f\u0454 \u0441\u043e\u0431\u043e\u044e \u043e\u0434\u043d\u0443, \u0434\u0432\u0456 \u0447\u0438 \u0434\u0435\u043a\u0456\u043b\u044c\u043a\u0430 \u0440\u0456\u0437\u043d\u043e\u043a\u043e\u043b\u044c\u043e\u0440\u043e\u0432\u0438\u0445 \u0434\u0443\u0433.",
138
- "sumska",
139
  null,
140
  "uk"
141
  ]
@@ -147,7 +144,7 @@
147
  "use_language_weighted_sampler": true,
148
  "language_weighted_sampler_alpha": 1.0,
149
  "model_args": {
150
- "num_chars": 52,
151
  "out_channels": 513,
152
  "spec_segment_size": 32,
153
  "hidden_channels": 192,
@@ -166,38 +163,33 @@
166
  "resblock_type_decoder": "2",
167
  "resblock_kernel_sizes_decoder": [
168
  3,
169
- 7,
170
- 11
171
  ],
172
  "resblock_dilation_sizes_decoder": [
173
  [
174
  1,
175
- 3,
176
- 5
177
  ],
178
  [
179
- 1,
180
- 3,
181
- 5
182
  ],
183
  [
184
- 1,
185
  3,
186
- 5
187
  ]
188
  ],
189
  "upsample_rates_decoder": [
190
  8,
191
  8,
192
- 2,
193
- 2
194
  ],
195
- "upsample_initial_channel_decoder": 512,
196
  "upsample_kernel_sizes_decoder": [
197
  16,
198
  16,
199
- 4,
200
- 4
201
  ],
202
  "use_sdp": true,
203
  "noise_scale": 1.0,
 
23
  "distributed_url": "tcp://localhost:54321",
24
  "mixed_precision": true,
25
  "epochs": 1000,
26
+ "batch_size": 32,
27
  "eval_batch_size": 4,
28
  "grad_clip": [
29
  1000,
 
45
  "use_grad_scaler": false,
46
  "cudnn_enable": true,
47
  "cudnn_deterministic": false,
48
+ "cudnn_benchmark": false,
49
  "training_seed": 54321,
50
  "model": "vits",
51
  "num_loader_workers": 8,
 
58
  "frame_shift_ms": null,
59
  "frame_length_ms": null,
60
  "stft_pad_mode": "reflect",
61
+ "sample_rate": 22050,
62
  "resample": false,
63
  "preemphasis": 0.0,
64
  "ref_level_db": 20,
 
78
  "do_amp_to_db_mel": true,
79
  "pitch_fmax": 640.0,
80
  "pitch_fmin": 0.0,
81
+ "signal_norm": true,
82
  "min_level_db": -100,
83
  "symmetric_norm": true,
84
  "max_norm": 4.0,
 
100
  "eos": "<EOS>",
101
  "bos": "<BOS>",
102
  "blank": "<BLNK>",
103
+ "characters": "\u0430\u0431\u0432\u0433\u0491\u0434\u0435\u0454\u0436\u0437\u0438\u0456\u0457\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u044d\u0451\u044b\u044a+",
104
  "punctuations": "!'(),-.:;? ",
105
  "phonemes": null,
106
  "is_unique": true,
 
111
  "loss_masking": null,
112
  "sort_by_audio_len": false,
113
  "min_audio_len": 32768,
114
+ "max_audio_len": 132300,
115
  "min_text_len": 1,
116
  "max_text_len": Infinity,
117
  "compute_f0": false,
 
120
  "start_by_longest": false,
121
  "datasets": [
122
  {
123
+ "name": "ljspeech",
124
+ "path": "./datasets/uk_Mykyta",
125
+ "meta_file_train": "metadata_not.csv",
126
+ "ignored_speakers": null,
 
 
 
127
  "language": "uk",
128
  "meta_file_val": "",
129
  "meta_file_attn_mask": ""
 
132
  "test_sentences": [
133
  [
134
  "\u0412\u0435\u0441\u0435\u043b\u043a\u0430, \u0442\u0430\u043a\u043e\u0436 \u0440\u0430\u0439\u0434\u0443\u0433\u0430 \u043e\u043f\u0442\u0438\u0447\u043d\u0435 \u044f\u0432\u0438\u0449\u0435 \u0432 \u0430\u0442\u043c\u043e\u0441\u0444\u0435\u0440\u0456, \u0449\u043e \u044f\u0432\u043b\u044f\u0454 \u0441\u043e\u0431\u043e\u044e \u043e\u0434\u043d\u0443, \u0434\u0432\u0456 \u0447\u0438 \u0434\u0435\u043a\u0456\u043b\u044c\u043a\u0430 \u0440\u0456\u0437\u043d\u043e\u043a\u043e\u043b\u044c\u043e\u0440\u043e\u0432\u0438\u0445 \u0434\u0443\u0433.",
135
+ "ljspeech",
136
  null,
137
  "uk"
138
  ]
 
144
  "use_language_weighted_sampler": true,
145
  "language_weighted_sampler_alpha": 1.0,
146
  "model_args": {
147
+ "num_chars": 53,
148
  "out_channels": 513,
149
  "spec_segment_size": 32,
150
  "hidden_channels": 192,
 
163
  "resblock_type_decoder": "2",
164
  "resblock_kernel_sizes_decoder": [
165
  3,
166
+ 5,
167
+ 7
168
  ],
169
  "resblock_dilation_sizes_decoder": [
170
  [
171
  1,
172
+ 2
 
173
  ],
174
  [
175
+ 2,
176
+ 6
 
177
  ],
178
  [
 
179
  3,
180
+ 12
181
  ]
182
  ],
183
  "upsample_rates_decoder": [
184
  8,
185
  8,
186
+ 4
 
187
  ],
188
+ "upsample_initial_channel_decoder": 256,
189
  "upsample_kernel_sizes_decoder": [
190
  16,
191
  16,
192
+ 8
 
193
  ],
194
  "use_sdp": true,
195
  "noise_scale": 1.0,
speaker_ids.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
  "uk": 0,
3
- "sumska": 0
4
  }
 
1
  {
2
  "uk": 0,
3
+ "mykyta": 0
4
  }