Roh committed on
Commit
faafadc
1 Parent(s): bd87a1e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +269 -2
README.md CHANGED
@@ -10,7 +10,274 @@ license: cc-by-nc-4.0
10
  widget:
11
  - text: "Ryan is a socially assistive robot."
12
  ---
13
- ## ESPnet2 TTS model (RyanSpeech TTS)
14
 
15
  ### `espnet/english_ryanspeech_fastspeech`
16
- This model was trained by Rohola Zandie using ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  widget:
11
  - text: "Ryan is a socially assistive robot."
12
  ---
13
+ ## RyanSpeech model (based on ESPnet2)
14
 
15
  ### `espnet/english_ryanspeech_fastspeech`
16
+ This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+
19
+ ## TTS config
20
+
21
+ <details><summary>expand</summary>
22
+
23
+ ```
24
+ config: conf/tuning/train_fastspeech.yaml
25
+ print_config: false
26
+ log_level: INFO
27
+ dry_run: false
28
+ iterator_type: sequence
29
+ output_dir: exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space
30
+ ngpu: 1
31
+ seed: 0
32
+ num_workers: 1
33
+ num_att_plot: 3
34
+ dist_backend: nccl
35
+ dist_init_method: env://
36
+ dist_world_size: null
37
+ dist_rank: null
38
+ local_rank: 0
39
+ dist_master_addr: null
40
+ dist_master_port: null
41
+ dist_launcher: null
42
+ multiprocessing_distributed: false
43
+ cudnn_enabled: true
44
+ cudnn_benchmark: false
45
+ cudnn_deterministic: true
46
+ collect_stats: false
47
+ write_collected_feats: false
48
+ max_epoch: 1000
49
+ patience: null
50
+ val_scheduler_criterion:
51
+ - valid
52
+ - loss
53
+ early_stopping_criterion:
54
+ - valid
55
+ - loss
56
+ - min
57
+ best_model_criterion:
58
+ - - valid
59
+ - loss
60
+ - min
61
+ - - train
62
+ - loss
63
+ - min
64
+ keep_nbest_models: 5
65
+ grad_clip: 1.0
66
+ grad_clip_type: 2.0
67
+ grad_noise: false
68
+ accum_grad: 6
69
+ no_forward_run: false
70
+ resume: true
71
+ train_dtype: float32
72
+ use_amp: false
73
+ log_interval: null
74
+ pretrain_path: []
75
+ pretrain_key: []
76
+ num_iters_per_epoch: 500
77
+ batch_size: 20
78
+ valid_batch_size: null
79
+ batch_bins: 800000
80
+ valid_batch_bins: null
81
+ train_shape_file:
82
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/text_shape.phn
83
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/speech_shape
84
+ valid_shape_file:
85
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/valid/text_shape.phn
86
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/valid/speech_shape
87
+ batch_type: numel
88
+ valid_batch_type: null
89
+ fold_length:
90
+ - 150
91
+ - 204800
92
+ sort_in_batch: descending
93
+ sort_batch: descending
94
+ multiple_iterator: false
95
+ chunk_length: 500
96
+ chunk_shift_ratio: 0.5
97
+ num_cache_chunks: 1024
98
+ train_data_path_and_name_and_type:
99
+ - - dump/raw/tr_no_dev/text
100
+ - text
101
+ - text
102
+ - - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best//tr_no_dev/durations
103
+ - durations
104
+ - text_int
105
+ - - dump/raw/tr_no_dev/wav.scp
106
+ - speech
107
+ - sound
108
+ valid_data_path_and_name_and_type:
109
+ - - dump/raw/dev/text
110
+ - text
111
+ - text
112
+ - - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best//dev/durations
113
+ - durations
114
+ - text_int
115
+ - - dump/raw/dev/wav.scp
116
+ - speech
117
+ - sound
118
+ allow_variable_data_keys: false
119
+ max_cache_size: 0.0
120
+ max_cache_fd: 32
121
+ valid_max_cache_size: null
122
+ optim: adam
123
+ optim_conf:
124
+ lr: 1.0
125
+ scheduler: noamlr
126
+ scheduler_conf:
127
+ model_size: 384
128
+ warmup_steps: 4000
129
+ token_list:
130
+ - <blank>
131
+ - <unk>
132
+ - AH0
133
+ - T
134
+ - N
135
+ - S
136
+ - R
137
+ - D
138
+ - L
139
+ - K
140
+ - IH1
141
+ - M
142
+ - EH1
143
+ - Z
144
+ - DH
145
+ - UW1
146
+ - AE1
147
+ - IH0
148
+ - AY1
149
+ - AH1
150
+ - W
151
+ - .
152
+ - P
153
+ - F
154
+ - IY1
155
+ - V
156
+ - ER0
157
+ - AA1
158
+ - B
159
+ - AO1
160
+ - HH
161
+ - EY1
162
+ - IY0
163
+ - ','
164
+ - Y
165
+ - NG
166
+ - OW1
167
+ - G
168
+ - AW1
169
+ - TH
170
+ - SH
171
+ - UH1
172
+ - '?'
173
+ - ER1
174
+ - JH
175
+ - CH
176
+ - OW0
177
+ - OW2
178
+ - EH2
179
+ - IH2
180
+ - EY2
181
+ - AA2
182
+ - AE2
183
+ - AY2
184
+ - ''''
185
+ - OY1
186
+ - UW0
187
+ - '!'
188
+ - AO2
189
+ - EH0
190
+ - ZH
191
+ - AH2
192
+ - AE0
193
+ - UW2
194
+ - AA0
195
+ - AY0
196
+ - IY2
197
+ - AW2
198
+ - AO0
199
+ - EY0
200
+ - ER2
201
+ - UH2
202
+ - '...'
203
+ - AW0
204
+ - UH0
205
+ - OY2
206
+ - <sos/eos>
207
+ odim: null
208
+ model_conf: {}
209
+ use_preprocessor: true
210
+ token_type: phn
211
+ bpemodel: null
212
+ non_linguistic_symbols: null
213
+ cleaner: tacotron
214
+ g2p: g2p_en_no_space
215
+ feats_extract: fbank
216
+ feats_extract_conf:
217
+ fs: 22050
218
+ fmin: 80
219
+ fmax: 7600
220
+ n_mels: 80
221
+ hop_length: 256
222
+ n_fft: 1024
223
+ win_length: null
224
+ normalize: global_mvn
225
+ normalize_conf:
226
+ stats_file: exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/feats_stats.npz
227
+ tts: fastspeech
228
+ tts_conf:
229
+ adim: 384
230
+ aheads: 2
231
+ elayers: 6
232
+ eunits: 1536
233
+ dlayers: 6
234
+ dunits: 1536
235
+ positionwise_layer_type: conv1d
236
+ positionwise_conv_kernel_size: 3
237
+ duration_predictor_layers: 2
238
+ duration_predictor_chans: 384
239
+ duration_predictor_kernel_size: 3
240
+ postnet_layers: 5
241
+ postnet_filts: 5
242
+ postnet_chans: 256
243
+ use_masking: true
244
+ use_scaled_pos_enc: true
245
+ encoder_normalize_before: true
246
+ decoder_normalize_before: true
247
+ reduction_factor: 1
248
+ init_type: xavier_uniform
249
+ init_enc_alpha: 1.0
250
+ init_dec_alpha: 1.0
251
+ transformer_enc_dropout_rate: 0.1
252
+ transformer_enc_positional_dropout_rate: 0.1
253
+ transformer_enc_attn_dropout_rate: 0.1
254
+ transformer_dec_dropout_rate: 0.1
255
+ transformer_dec_positional_dropout_rate: 0.1
256
+ transformer_dec_attn_dropout_rate: 0.1
257
+ pitch_extract: null
258
+ pitch_extract_conf: {}
259
+ pitch_normalize: null
260
+ pitch_normalize_conf: {}
261
+ energy_extract: null
262
+ energy_extract_conf: {}
263
+ energy_normalize: null
264
+ energy_normalize_conf: {}
265
+ required:
266
+ - output_dir
267
+ - token_list
268
+ distributed: false
269
+ ```
270
+
271
+ </details>
272
+
273
+
274
+ ### Citing RyanSpeech
275
+
276
+ ```bibtex
277
+ @inproceedings{Zandie2021RyanSpeechAC,
278
+ title={RyanSpeech: A Corpus for Conversational Text-to-Speech Synthesis},
279
+ author={Rohola Zandie and Mohammad H. Mahoor and Julia Madsen and Eshrat S. Emamian},
280
+ booktitle={Interspeech},
281
+ year={2021}
282
+ }
283
+ ```