Kamtera committed on
Commit
dd0746e
1 Parent(s): 88fcd3f

Upload config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +53 -48
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "output_path": "/kaggle/working/train_output/kagg",
3
  "logger_uri": null,
4
  "run_name": "vits_fa_female",
5
  "project_name": null,
@@ -98,54 +98,53 @@
98
  "drop_last": false,
99
  "datasets": [
100
  {
101
- "formatter": "mozilla",
102
- "dataset_name": "",
103
- "path": "/kaggle/input/persian-tts-dataset",
104
  "meta_file_train": "metadata.csv",
105
  "ignored_speakers": null,
106
- "language": "",
107
- "phonemizer": "",
108
- "meta_file_val": "",
109
- "meta_file_attn_mask": ""
110
- },
111
- {
112
- "formatter": "mozilla",
113
- "dataset_name": "",
114
- "path": "/kaggle/input/persian-tts-dataset-famale",
115
- "meta_file_train": "metadata.csv",
116
- "ignored_speakers": null,
117
- "language": "",
118
- "phonemizer": "",
119
- "meta_file_val": "",
120
- "meta_file_attn_mask": ""
121
- },
122
- {
123
- "formatter": "mozilla",
124
- "dataset_name": "",
125
- "path": "/kaggle/input/persian-tts-dataset-male",
126
- "meta_file_train": "metadata.csv",
127
- "ignored_speakers": null,
128
- "language": "",
129
- "phonemizer": "",
130
  "meta_file_val": "",
131
  "meta_file_attn_mask": ""
132
  }
133
  ],
134
  "test_sentences": [
135
  [
136
- "\u0633\u0644\u0637\u0627\u0646 \u0645\u062d\u0645\u0648\u062f \u062f\u0631 \u0632\u0645\u0633\u062a\u0627\u0646\u06cc \u0633\u062e\u062a \u0628\u0647 \u0637\u0644\u062e\u06a9 \u06af\u0641\u062a \u06a9\u0647: \u0628\u0627 \u0627\u06cc\u0646 \u062c\u0627\u0645\u0647 \u06cc \u06cc\u06a9 \u0644\u0627 \u062f\u0631 \u0627\u06cc\u0646 \u0633\u0631\u0645\u0627 \u0686\u0647 \u0645\u06cc \u06a9\u0646\u06cc "
 
 
 
 
 
 
 
 
 
137
  ],
138
  [
139
- "\u0645\u0631\u062f\u06cc \u0646\u0632\u062f \u0628\u0642\u0627\u0644\u06cc \u0622\u0645\u062f \u0648 \u06af\u0641\u062a \u067e\u06cc\u0627\u0632 \u0647\u0645 \u062f\u0647 \u062a\u0627 \u062f\u0647\u0627\u0646 \u0628\u062f\u0627\u0646 \u062e\u0648 \u0634\u0628\u0648\u06cc \u0633\u0627\u0632\u0645."
 
 
 
140
  ],
141
  [
142
- "\u0627\u0632 \u0645\u0627\u0644 \u062e\u0648\u062f \u067e\u0627\u0631\u0647 \u0627\u06cc \u06af\u0648\u0634\u062a \u0628\u0633\u062a\u0627\u0646 \u0648 \u0632\u06cc\u0631\u0647 \u0628\u0627\u06cc\u06cc \u0645\u0639\u0637\u0651\u0631 \u0628\u0633\u0627\u0632"
 
 
 
143
  ],
144
  [
145
- "\u06cc\u06a9 \u0628\u0627\u0631 \u0647\u0645 \u0627\u0632 \u062c\u0647\u0646\u0645 \u0628\u06af\u0648\u06cc\u06cc\u062f."
 
 
 
146
  ],
147
  [
148
- "\u06cc\u06a9\u06cc \u0627\u0633\u0628\u06cc \u0628\u0647 \u0639\u0627\u0631\u06cc\u062a \u062e\u0648\u0627\u0633\u062a"
 
 
 
149
  ]
150
  ],
151
  "eval_split_max_size": null,
@@ -163,7 +162,7 @@
163
  "hidden_channels": 192,
164
  "hidden_channels_ffn_text_encoder": 768,
165
  "num_heads_text_encoder": 2,
166
- "num_layers_text_encoder": 6,
167
  "kernel_size_text_encoder": 3,
168
  "dropout_p_text_encoder": 0.1,
169
  "dropout_p_duration_predictor": 0.5,
@@ -227,19 +226,21 @@
227
  "use_spectral_norm_disriminator": false,
228
  "use_speaker_embedding": false,
229
  "num_speakers": 0,
230
- "speakers_file": "train_output/kagg/speakers.pth",
231
- "d_vector_file": null,
 
 
232
  "speaker_embedding_channels": 256,
233
- "use_d_vector_file": false,
234
- "d_vector_dim": 0,
235
  "detach_dp_input": true,
236
  "use_language_embedding": false,
237
  "embedded_language_dim": 4,
238
  "num_languages": 0,
239
  "language_ids_file": null,
240
  "use_speaker_encoder_as_loss": false,
241
- "speaker_encoder_config_path": "",
242
- "speaker_encoder_model_path": "",
243
  "condition_dp_on_speaker": true,
244
  "freeze_encoder": false,
245
  "freeze_DP": false,
@@ -269,19 +270,23 @@
269
  "feat_loss_alpha": 1.0,
270
  "mel_loss_alpha": 45.0,
271
  "dur_loss_alpha": 1.0,
272
- "speaker_encoder_loss_alpha": 1.0,
273
  "return_wav": true,
274
- "use_weighted_sampler": false,
275
- "weighted_sampler_attrs": {},
 
 
276
  "weighted_sampler_multipliers": {},
277
  "r": 1,
278
- "num_speakers": 3,
279
  "use_speaker_embedding": false,
280
- "speakers_file": "train_output/kagg/speakers.pth",
281
  "speaker_embedding_channels": 256,
282
  "language_ids_file": null,
283
  "use_language_embedding": false,
284
- "use_d_vector_file": false,
285
- "d_vector_file": null,
286
- "d_vector_dim": 0
 
 
287
  }
 
1
  {
2
+ "output_path": "/kaggle/working/train_output",
3
  "logger_uri": null,
4
  "run_name": "vits_fa_female",
5
  "project_name": null,
 
98
  "drop_last": false,
99
  "datasets": [
100
  {
101
+ "formatter": "mozilla_with_speaker",
102
+ "dataset_name": "multi_persian",
103
+ "path": "/kaggle/input",
104
  "meta_file_train": "metadata.csv",
105
  "ignored_speakers": null,
106
+ "language": "fa",
107
+ "phonemizer": "espeak",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  "meta_file_val": "",
109
  "meta_file_attn_mask": ""
110
  }
111
  ],
112
  "test_sentences": [
113
  [
114
+ "\u0633\u0644\u0637\u0627\u0646 \u0645\u062d\u0645\u0648\u062f \u062f\u0631 \u0632\u0645\u0633\u062a\u0627\u0646\u06cc \u0633\u062e\u062a \u0628\u0647 \u0637\u0644\u062e\u06a9 \u06af\u0641\u062a \u06a9\u0647",
115
+ "dilara",
116
+ null,
117
+ "fa"
118
+ ],
119
+ [
120
+ " \u0628\u0627 \u0627\u06cc\u0646 \u062c\u0627\u0645\u0647 \u06cc \u06cc\u06a9 \u0644\u0627 \u062f\u0631 \u0627\u06cc\u0646 \u0633\u0631\u0645\u0627 \u0686\u0647 \u0645\u06cc \u06a9\u0646\u06cc ",
121
+ "farid",
122
+ null,
123
+ "fa"
124
  ],
125
  [
126
+ "\u0645\u0631\u062f\u06cc \u0646\u0632\u062f \u0628\u0642\u0627\u0644\u06cc \u0622\u0645\u062f \u0648 \u06af\u0641\u062a \u067e\u06cc\u0627\u0632 \u0647\u0645 \u062f\u0647 \u062a\u0627 \u062f\u0647\u0627\u0646 \u0628\u062f\u0627\u0646 \u062e\u0648 \u0634\u0628\u0648\u06cc \u0633\u0627\u0632\u0645.",
127
+ "farid",
128
+ null,
129
+ "fa"
130
  ],
131
  [
132
+ "\u0627\u0632 \u0645\u0627\u0644 \u062e\u0648\u062f \u067e\u0627\u0631\u0647 \u0627\u06cc \u06af\u0648\u0634\u062a \u0628\u0633\u062a\u0627\u0646 \u0648 \u0632\u06cc\u0631\u0647 \u0628\u0627\u06cc\u06cc \u0645\u0639\u0637\u0651\u0631 \u0628\u0633\u0627\u0632",
133
+ "dilara",
134
+ null,
135
+ "fa"
136
  ],
137
  [
138
+ "\u06cc\u06a9 \u0628\u0627\u0631 \u0647\u0645 \u0627\u0632 \u062c\u0647\u0646\u0645 \u0628\u06af\u0648\u06cc\u06cc\u062f.",
139
+ "changiz",
140
+ null,
141
+ "fa"
142
  ],
143
  [
144
+ "\u06cc\u06a9\u06cc \u0627\u0633\u0628\u06cc \u0628\u0647 \u0639\u0627\u0631\u06cc\u062a \u062e\u0648\u0627\u0633\u062a",
145
+ "changiz",
146
+ null,
147
+ "fa"
148
  ]
149
  ],
150
  "eval_split_max_size": null,
 
162
  "hidden_channels": 192,
163
  "hidden_channels_ffn_text_encoder": 768,
164
  "num_heads_text_encoder": 2,
165
+ "num_layers_text_encoder": 10,
166
  "kernel_size_text_encoder": 3,
167
  "dropout_p_text_encoder": 0.1,
168
  "dropout_p_duration_predictor": 0.5,
 
226
  "use_spectral_norm_disriminator": false,
227
  "use_speaker_embedding": false,
228
  "num_speakers": 0,
229
+ "speakers_file": "train_output/vits_fa_female-March-18-2023_08+54PM-0000000/speakers.pth",
230
+ "d_vector_file": [
231
+ "/kaggle/working/speakers.pth"
232
+ ],
233
  "speaker_embedding_channels": 256,
234
+ "use_d_vector_file": true,
235
+ "d_vector_dim": 512,
236
  "detach_dp_input": true,
237
  "use_language_embedding": false,
238
  "embedded_language_dim": 4,
239
  "num_languages": 0,
240
  "language_ids_file": null,
241
  "use_speaker_encoder_as_loss": false,
242
+ "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
243
+ "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
244
  "condition_dp_on_speaker": true,
245
  "freeze_encoder": false,
246
  "freeze_DP": false,
 
270
  "feat_loss_alpha": 1.0,
271
  "mel_loss_alpha": 45.0,
272
  "dur_loss_alpha": 1.0,
273
+ "speaker_encoder_loss_alpha": 9.0,
274
  "return_wav": true,
275
+ "use_weighted_sampler": true,
276
+ "weighted_sampler_attrs": {
277
+ "speaker_name": 1.0
278
+ },
279
  "weighted_sampler_multipliers": {},
280
  "r": 1,
281
+ "num_speakers": 0,
282
  "use_speaker_embedding": false,
283
+ "speakers_file": "train_output/vits_fa_female-March-18-2023_08+54PM-0000000/speakers.pth",
284
  "speaker_embedding_channels": 256,
285
  "language_ids_file": null,
286
  "use_language_embedding": false,
287
+ "use_d_vector_file": true,
288
+ "d_vector_file": [
289
+ "/kaggle/working/speakers.pth"
290
+ ],
291
+ "d_vector_dim": 512
292
  }