Roh committed on
Commit
f6455e0
1 Parent(s): c3730dc

added readme file

Browse files
Files changed (1) hide show
  1. README.md +286 -0
README.md ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - ryanspeech
9
+ license: cc-by-nc-4.0
10
+ widget:
11
+ - text: "This seems a very pleasant place, and I think I shall enjoy myself very much."
12
+ ---
13
+ ## RyanSpeech model (based on ESPnet2)
14
+
15
+ ### `espnet/english_ryanspeech_fastspeech`
16
+ This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+
19
+ ## Download the dataset
20
+ You can download the RyanSpeech dataset from [Kaggle](https://www.kaggle.com/datasets/roholazandie/ryanspeech).
21
+
22
+ ## TTS config
23
+
24
+ <details><summary>expand</summary>
25
+
26
+ ```
27
+ config: conf/tuning/train_fastspeech.yaml
28
+ print_config: false
29
+ log_level: INFO
30
+ dry_run: false
31
+ iterator_type: sequence
32
+ output_dir: exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space
33
+ ngpu: 1
34
+ seed: 0
35
+ num_workers: 1
36
+ num_att_plot: 3
37
+ dist_backend: nccl
38
+ dist_init_method: env://
39
+ dist_world_size: null
40
+ dist_rank: null
41
+ local_rank: 0
42
+ dist_master_addr: null
43
+ dist_master_port: null
44
+ dist_launcher: null
45
+ multiprocessing_distributed: false
46
+ cudnn_enabled: true
47
+ cudnn_benchmark: false
48
+ cudnn_deterministic: true
49
+ collect_stats: false
50
+ write_collected_feats: false
51
+ max_epoch: 1000
52
+ patience: null
53
+ val_scheduler_criterion:
54
+ - valid
55
+ - loss
56
+ early_stopping_criterion:
57
+ - valid
58
+ - loss
59
+ - min
60
+ best_model_criterion:
61
+ - - valid
62
+ - loss
63
+ - min
64
+ - - train
65
+ - loss
66
+ - min
67
+ keep_nbest_models: 5
68
+ grad_clip: 1.0
69
+ grad_clip_type: 2.0
70
+ grad_noise: false
71
+ accum_grad: 6
72
+ no_forward_run: false
73
+ resume: true
74
+ train_dtype: float32
75
+ use_amp: false
76
+ log_interval: null
77
+ pretrain_path: []
78
+ pretrain_key: []
79
+ num_iters_per_epoch: 500
80
+ batch_size: 20
81
+ valid_batch_size: null
82
+ batch_bins: 800000
83
+ valid_batch_bins: null
84
+ train_shape_file:
85
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/text_shape.phn
86
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/speech_shape
87
+ valid_shape_file:
88
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/valid/text_shape.phn
89
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/valid/speech_shape
90
+ batch_type: numel
91
+ valid_batch_type: null
92
+ fold_length:
93
+ - 150
94
+ - 204800
95
+ sort_in_batch: descending
96
+ sort_batch: descending
97
+ multiple_iterator: false
98
+ chunk_length: 500
99
+ chunk_shift_ratio: 0.5
100
+ num_cache_chunks: 1024
101
+ train_data_path_and_name_and_type:
102
+ - - dump/raw/tr_no_dev/text
103
+ - text
104
+ - text
105
+ - - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best//tr_no_dev/durations
106
+ - durations
107
+ - text_int
108
+ - - dump/raw/tr_no_dev/wav.scp
109
+ - speech
110
+ - sound
111
+ valid_data_path_and_name_and_type:
112
+ - - dump/raw/dev/text
113
+ - text
114
+ - text
115
+ - - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best//dev/durations
116
+ - durations
117
+ - text_int
118
+ - - dump/raw/dev/wav.scp
119
+ - speech
120
+ - sound
121
+ allow_variable_data_keys: false
122
+ max_cache_size: 0.0
123
+ max_cache_fd: 32
124
+ valid_max_cache_size: null
125
+ optim: adam
126
+ optim_conf:
127
+ lr: 1.0
128
+ scheduler: noamlr
129
+ scheduler_conf:
130
+ model_size: 384
131
+ warmup_steps: 4000
132
+ token_list:
133
+ - <blank>
134
+ - <unk>
135
+ - AH0
136
+ - T
137
+ - N
138
+ - S
139
+ - R
140
+ - D
141
+ - L
142
+ - K
143
+ - IH1
144
+ - M
145
+ - EH1
146
+ - Z
147
+ - DH
148
+ - UW1
149
+ - AE1
150
+ - IH0
151
+ - AY1
152
+ - AH1
153
+ - W
154
+ - .
155
+ - P
156
+ - F
157
+ - IY1
158
+ - V
159
+ - ER0
160
+ - AA1
161
+ - B
162
+ - AO1
163
+ - HH
164
+ - EY1
165
+ - IY0
166
+ - ','
167
+ - Y
168
+ - NG
169
+ - OW1
170
+ - G
171
+ - AW1
172
+ - TH
173
+ - SH
174
+ - UH1
175
+ - '?'
176
+ - ER1
177
+ - JH
178
+ - CH
179
+ - OW0
180
+ - OW2
181
+ - EH2
182
+ - IH2
183
+ - EY2
184
+ - AA2
185
+ - AE2
186
+ - AY2
187
+ - ''''
188
+ - OY1
189
+ - UW0
190
+ - '!'
191
+ - AO2
192
+ - EH0
193
+ - ZH
194
+ - AH2
195
+ - AE0
196
+ - UW2
197
+ - AA0
198
+ - AY0
199
+ - IY2
200
+ - AW2
201
+ - AO0
202
+ - EY0
203
+ - ER2
204
+ - UH2
205
+ - '...'
206
+ - AW0
207
+ - UH0
208
+ - OY2
209
+ - <sos/eos>
210
+ odim: null
211
+ model_conf: {}
212
+ use_preprocessor: true
213
+ token_type: phn
214
+ bpemodel: null
215
+ non_linguistic_symbols: null
216
+ cleaner: tacotron
217
+ g2p: g2p_en_no_space
218
+ feats_extract: fbank
219
+ feats_extract_conf:
220
+ fs: 22050
221
+ fmin: 80
222
+ fmax: 7600
223
+ n_mels: 80
224
+ hop_length: 256
225
+ n_fft: 1024
226
+ win_length: null
227
+ normalize: global_mvn
228
+ normalize_conf:
229
+ stats_file: exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/feats_stats.npz
230
+ tts: fastspeech
231
+ tts_conf:
232
+ adim: 384
233
+ aheads: 2
234
+ elayers: 6
235
+ eunits: 1536
236
+ dlayers: 6
237
+ dunits: 1536
238
+ positionwise_layer_type: conv1d
239
+ positionwise_conv_kernel_size: 3
240
+ duration_predictor_layers: 2
241
+ duration_predictor_chans: 384
242
+ duration_predictor_kernel_size: 3
243
+ postnet_layers: 5
244
+ postnet_filts: 5
245
+ postnet_chans: 256
246
+ use_masking: true
247
+ use_scaled_pos_enc: true
248
+ encoder_normalize_before: true
249
+ decoder_normalize_before: true
250
+ reduction_factor: 1
251
+ init_type: xavier_uniform
252
+ init_enc_alpha: 1.0
253
+ init_dec_alpha: 1.0
254
+ transformer_enc_dropout_rate: 0.1
255
+ transformer_enc_positional_dropout_rate: 0.1
256
+ transformer_enc_attn_dropout_rate: 0.1
257
+ transformer_dec_dropout_rate: 0.1
258
+ transformer_dec_positional_dropout_rate: 0.1
259
+ transformer_dec_attn_dropout_rate: 0.1
260
+ pitch_extract: null
261
+ pitch_extract_conf: {}
262
+ pitch_normalize: null
263
+ pitch_normalize_conf: {}
264
+ energy_extract: null
265
+ energy_extract_conf: {}
266
+ energy_normalize: null
267
+ energy_normalize_conf: {}
268
+ required:
269
+ - output_dir
270
+ - token_list
271
+ distributed: false
272
+ ```
273
+
274
+ </details>
275
+
276
+
277
+ ### Citing RyanSpeech
278
+
279
+ ```BibTex
280
+ @inproceedings{Zandie2021RyanSpeechAC,
281
+ title={RyanSpeech: A Corpus for Conversational Text-to-Speech Synthesis},
282
+ author={Rohola Zandie and Mohammad H. Mahoor and Julia Madsen and Eshrat S. Emamian},
283
+ booktitle={Interspeech},
284
+ year={2021}
285
+ }
286
+ ```