Roh committed on
Commit
342a0e6
1 Parent(s): 6607412

added readme file

Browse files
Files changed (1) hide show
  1. README.md +284 -0
README.md ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - ryanspeech
9
+ license: cc-by-nc-4.0
10
+ widget:
11
+ - text: "This seems a very pleasant place, and I think I shall enjoy myself very much."
12
+ ---
13
+ ## RyanSpeech model (based on ESPnet2)
14
+
15
+ ### `espnet/english_male_ryanspeech_tacotron`
16
+ This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+
19
+ ## Download the dataset
20
+ You can download the RyanSpeech dataset from [Kaggle](https://www.kaggle.com/datasets/roholazandie/ryanspeech).
21
+
22
+ ## TTS config
23
+
24
+ <details><summary>expand</summary>
25
+
26
+ ```
27
+ config: conf/train.yaml
28
+ print_config: false
29
+ log_level: INFO
30
+ dry_run: false
31
+ iterator_type: sequence
32
+ output_dir: exp/tts_train_raw_phn_tacotron_g2p_en_no_space
33
+ ngpu: 1
34
+ seed: 0
35
+ num_workers: 1
36
+ num_att_plot: 3
37
+ dist_backend: nccl
38
+ dist_init_method: env://
39
+ dist_world_size: null
40
+ dist_rank: null
41
+ local_rank: 0
42
+ dist_master_addr: null
43
+ dist_master_port: null
44
+ dist_launcher: null
45
+ multiprocessing_distributed: false
46
+ cudnn_enabled: true
47
+ cudnn_benchmark: false
48
+ cudnn_deterministic: true
49
+ collect_stats: false
50
+ write_collected_feats: false
51
+ max_epoch: 200
52
+ patience: null
53
+ val_scheduler_criterion:
54
+ - valid
55
+ - loss
56
+ early_stopping_criterion:
57
+ - valid
58
+ - loss
59
+ - min
60
+ best_model_criterion:
61
+ - - valid
62
+ - loss
63
+ - min
64
+ - - train
65
+ - loss
66
+ - min
67
+ keep_nbest_models: 5
68
+ grad_clip: 1.0
69
+ grad_clip_type: 2.0
70
+ grad_noise: false
71
+ accum_grad: 1
72
+ no_forward_run: false
73
+ resume: true
74
+ train_dtype: float32
75
+ use_amp: false
76
+ log_interval: null
77
+ pretrain_path: []
78
+ pretrain_key: []
79
+ num_iters_per_epoch: 500
80
+ batch_size: 20
81
+ valid_batch_size: null
82
+ batch_bins: 5120000
83
+ valid_batch_bins: null
84
+ train_shape_file:
85
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
86
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
87
+ valid_shape_file:
88
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
89
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
90
+ batch_type: numel
91
+ valid_batch_type: null
92
+ fold_length:
93
+ - 150
94
+ - 204800
95
+ sort_in_batch: descending
96
+ sort_batch: descending
97
+ multiple_iterator: false
98
+ chunk_length: 500
99
+ chunk_shift_ratio: 0.5
100
+ num_cache_chunks: 1024
101
+ train_data_path_and_name_and_type:
102
+ - - dump/raw/tr_no_dev/text
103
+ - text
104
+ - text
105
+ - - dump/raw/tr_no_dev/wav.scp
106
+ - speech
107
+ - sound
108
+ valid_data_path_and_name_and_type:
109
+ - - dump/raw/dev/text
110
+ - text
111
+ - text
112
+ - - dump/raw/dev/wav.scp
113
+ - speech
114
+ - sound
115
+ allow_variable_data_keys: false
116
+ max_cache_size: 0.0
117
+ max_cache_fd: 32
118
+ valid_max_cache_size: null
119
+ optim: adam
120
+ optim_conf:
121
+ lr: 0.001
122
+ eps: 1.0e-06
123
+ weight_decay: 0.0
124
+ scheduler: null
125
+ scheduler_conf: {}
126
+ token_list:
127
+ - <blank>
128
+ - <unk>
129
+ - AH0
130
+ - T
131
+ - N
132
+ - S
133
+ - R
134
+ - D
135
+ - L
136
+ - K
137
+ - IH1
138
+ - M
139
+ - EH1
140
+ - Z
141
+ - DH
142
+ - UW1
143
+ - AE1
144
+ - IH0
145
+ - AY1
146
+ - AH1
147
+ - W
148
+ - .
149
+ - P
150
+ - F
151
+ - IY1
152
+ - V
153
+ - ER0
154
+ - AA1
155
+ - B
156
+ - AO1
157
+ - HH
158
+ - EY1
159
+ - IY0
160
+ - ','
161
+ - Y
162
+ - NG
163
+ - OW1
164
+ - G
165
+ - AW1
166
+ - TH
167
+ - SH
168
+ - UH1
169
+ - '?'
170
+ - ER1
171
+ - JH
172
+ - CH
173
+ - OW0
174
+ - OW2
175
+ - EH2
176
+ - IH2
177
+ - EY2
178
+ - AA2
179
+ - AE2
180
+ - AY2
181
+ - ''''
182
+ - OY1
183
+ - UW0
184
+ - '!'
185
+ - AO2
186
+ - EH0
187
+ - ZH
188
+ - AH2
189
+ - AE0
190
+ - UW2
191
+ - AA0
192
+ - AY0
193
+ - IY2
194
+ - AW2
195
+ - AO0
196
+ - EY0
197
+ - ER2
198
+ - UH2
199
+ - '...'
200
+ - AW0
201
+ - UH0
202
+ - OY2
203
+ - <sos/eos>
204
+ odim: null
205
+ model_conf: {}
206
+ use_preprocessor: true
207
+ token_type: phn
208
+ bpemodel: null
209
+ non_linguistic_symbols: null
210
+ cleaner: tacotron
211
+ g2p: g2p_en_no_space
212
+ feats_extract: fbank
213
+ feats_extract_conf:
214
+ fs: 22050
215
+ fmin: 80
216
+ fmax: 7600
217
+ n_mels: 80
218
+ hop_length: 256
219
+ n_fft: 1024
220
+ win_length: null
221
+ normalize: global_mvn
222
+ normalize_conf:
223
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
224
+ tts: tacotron2
225
+ tts_conf:
226
+ embed_dim: 512
227
+ elayers: 1
228
+ eunits: 512
229
+ econv_layers: 3
230
+ econv_chans: 512
231
+ econv_filts: 5
232
+ atype: location
233
+ adim: 512
234
+ aconv_chans: 32
235
+ aconv_filts: 15
236
+ cumulate_att_w: true
237
+ dlayers: 2
238
+ dunits: 1024
239
+ prenet_layers: 2
240
+ prenet_units: 256
241
+ postnet_layers: 5
242
+ postnet_chans: 512
243
+ postnet_filts: 5
244
+ output_activation: null
245
+ use_batch_norm: true
246
+ use_concate: true
247
+ use_residual: false
248
+ dropout_rate: 0.5
249
+ zoneout_rate: 0.1
250
+ reduction_factor: 1
251
+ spk_embed_dim: null
252
+ use_masking: true
253
+ bce_pos_weight: 5.0
254
+ use_guided_attn_loss: true
255
+ guided_attn_loss_sigma: 0.4
256
+ guided_attn_loss_lambda: 1.0
257
+ pitch_extract: null
258
+ pitch_extract_conf: {}
259
+ pitch_normalize: null
260
+ pitch_normalize_conf: {}
261
+ energy_extract: null
262
+ energy_extract_conf: {}
263
+ energy_normalize: null
264
+ energy_normalize_conf: {}
265
+ required:
266
+ - output_dir
267
+ - token_list
268
+ distributed: false
269
+
270
+ ```
271
+
272
+ </details>
273
+
274
+
275
+ ### Citing RyanSpeech
276
+
277
+ ```bibtex
278
+ @inproceedings{Zandie2021RyanSpeechAC,
279
+ title={RyanSpeech: A Corpus for Conversational Text-to-Speech Synthesis},
280
+ author={Rohola Zandie and Mohammad H. Mahoor and Julia Madsen and Eshrat S. Emamian},
281
+ booktitle={Interspeech},
282
+ year={2021}
283
+ }
284
+ ```