utkarsh2299 commited on
Commit
fb31477
·
verified ·
1 Parent(s): 3b5f98a

Upload 131 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. gujarati/female/model/config.yaml +278 -0
  2. gujarati/female/model/energy_stats.npz +3 -0
  3. gujarati/female/model/feats_stats.npz +3 -0
  4. gujarati/female/model/feats_type +1 -0
  5. gujarati/female/model/model.pth +3 -0
  6. gujarati/female/model/pitch_stats.npz +3 -0
  7. gujarati/male/model/config.yaml +276 -0
  8. gujarati/male/model/energy_stats.npz +3 -0
  9. gujarati/male/model/feats_stats.npz +3 -0
  10. gujarati/male/model/feats_type +1 -0
  11. gujarati/male/model/model.pth +3 -0
  12. gujarati/male/model/pitch_stats.npz +3 -0
  13. hifigan/LICENSE +21 -0
  14. hifigan/LJSpeech-1.1/training.txt +0 -0
  15. hifigan/LJSpeech-1.1/validation.txt +150 -0
  16. hifigan/README.md +105 -0
  17. hifigan/__init__.py +0 -0
  18. hifigan/__pycache__/__init__.cpython-37.pyc +0 -0
  19. hifigan/__pycache__/env.cpython-311.pyc +0 -0
  20. hifigan/__pycache__/env.cpython-37.pyc +0 -0
  21. hifigan/__pycache__/env.cpython-39.pyc +0 -0
  22. hifigan/__pycache__/meldataset.cpython-311.pyc +0 -0
  23. hifigan/__pycache__/meldataset.cpython-37.pyc +0 -0
  24. hifigan/__pycache__/meldataset.cpython-38.pyc +0 -0
  25. hifigan/__pycache__/meldataset.cpython-39.pyc +0 -0
  26. hifigan/__pycache__/models.cpython-311.pyc +0 -0
  27. hifigan/__pycache__/models.cpython-37.pyc +0 -0
  28. hifigan/__pycache__/models.cpython-39.pyc +0 -0
  29. hifigan/__pycache__/utils.cpython-311.pyc +0 -0
  30. hifigan/__pycache__/utils.cpython-37.pyc +0 -0
  31. hifigan/__pycache__/utils.cpython-39.pyc +0 -0
  32. hifigan/config.yaml +270 -0
  33. hifigan/config_v1.json +37 -0
  34. hifigan/config_v2.json +37 -0
  35. hifigan/config_v3.json +37 -0
  36. hifigan/denorm/test_243.npy.pt +3 -0
  37. hifigan/env.py +15 -0
  38. hifigan/fs2_speed.txt +24 -0
  39. hifigan/gen.wav +0 -0
  40. hifigan/griffin.wav +0 -0
  41. hifigan/hifigan_speed.txt +28 -0
  42. hifigan/inference.py +95 -0
  43. hifigan/inference_e2e.py +90 -0
  44. hifigan/inference_from_espnet.py +124 -0
  45. hifigan/meldataset.py +168 -0
  46. hifigan/models.py +283 -0
  47. hifigan/requirements.txt +7 -0
  48. hifigan/test_fs2_speed.py +14 -0
  49. hifigan/test_hifigan_speed.py +42 -0
  50. hifigan/test_tts_speed.py +45 -0
gujarati/female/model/config.yaml ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_fastspeech2_raw_char_None
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 2
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 35609
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 8
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ pretrain_path: null
64
+ init_param: []
65
+ ignore_init_mismatch: false
66
+ freeze_param: []
67
+ num_iters_per_epoch: 800
68
+ batch_size: 20
69
+ valid_batch_size: null
70
+ batch_bins: 3000000
71
+ valid_batch_bins: null
72
+ train_shape_file:
73
+ - exp/tts_stats_raw_char_None/train/text_shape.char
74
+ - exp/tts_stats_raw_char_None/train/speech_shape
75
+ valid_shape_file:
76
+ - exp/tts_stats_raw_char_None/valid/text_shape.char
77
+ - exp/tts_stats_raw_char_None/valid/speech_shape
78
+ batch_type: numel
79
+ valid_batch_type: null
80
+ fold_length:
81
+ - 150
82
+ - 204800
83
+ sort_in_batch: descending
84
+ sort_batch: descending
85
+ multiple_iterator: false
86
+ chunk_length: 500
87
+ chunk_shift_ratio: 0.5
88
+ num_cache_chunks: 1024
89
+ train_data_path_and_name_and_type:
90
+ - - dump/raw/tr_no_dev/text
91
+ - text
92
+ - text
93
+ - - duration_info/tr_no_dev/durations
94
+ - durations
95
+ - text_int
96
+ - - dump/raw/tr_no_dev/wav.scp
97
+ - speech
98
+ - sound
99
+ - - exp/tts_stats_raw_char_None/train/collect_feats/pitch.scp
100
+ - pitch
101
+ - npy
102
+ - - exp/tts_stats_raw_char_None/train/collect_feats/energy.scp
103
+ - energy
104
+ - npy
105
+ valid_data_path_and_name_and_type:
106
+ - - dump/raw/dev/text
107
+ - text
108
+ - text
109
+ - - duration_info/dev/durations
110
+ - durations
111
+ - text_int
112
+ - - dump/raw/dev/wav.scp
113
+ - speech
114
+ - sound
115
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/pitch.scp
116
+ - pitch
117
+ - npy
118
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/energy.scp
119
+ - energy
120
+ - npy
121
+ allow_variable_data_keys: false
122
+ max_cache_size: 0.0
123
+ max_cache_fd: 32
124
+ valid_max_cache_size: null
125
+ optim: adam
126
+ optim_conf:
127
+ lr: 1.0
128
+ scheduler: noamlr
129
+ scheduler_conf:
130
+ model_size: 384
131
+ warmup_steps: 4000
132
+ token_list:
133
+ - <blank>
134
+ - <unk>
135
+ - a
136
+ - A
137
+ - ','
138
+ - E
139
+ - r
140
+ - n
141
+ - I
142
+ - k
143
+ - o
144
+ - t
145
+ - m
146
+ - q
147
+ - w
148
+ - s
149
+ - p
150
+ - i
151
+ - y
152
+ - u
153
+ - l
154
+ - j
155
+ - h
156
+ - ट
157
+ - g
158
+ - d
159
+ - b
160
+ - $
161
+ - .
162
+ - श
163
+ - ड
164
+ - थ
165
+ - C
166
+ - ण
167
+ - c
168
+ - U
169
+ - ध
170
+ - B
171
+ - ख
172
+ - ള
173
+ - P
174
+ - ष
175
+ - J
176
+ - घ
177
+ - ठ
178
+ - R
179
+ - ऐ
180
+ - औ
181
+ - ढ
182
+ - ञ
183
+ - H
184
+ - ऑ
185
+ - ऍ
186
+ - M
187
+ - ॠ
188
+ - <sos/eos>
189
+ odim: null
190
+ model_conf: {}
191
+ use_preprocessor: true
192
+ token_type: char
193
+ bpemodel: null
194
+ non_linguistic_symbols: null
195
+ cleaner: null
196
+ g2p: g2p_en_no_space
197
+ feats_extract: fbank
198
+ feats_extract_conf:
199
+ n_fft: 1024
200
+ hop_length: 256
201
+ win_length: 1024
202
+ fs: 22050
203
+ fmin: 0
204
+ fmax: 8000
205
+ n_mels: 80
206
+ normalize: global_mvn
207
+ normalize_conf:
208
+ stats_file: /speech/arun/released_models/tts/female/gujarati/fastspeech2_hs/feats_stats.npz
209
+ tts: fastspeech2
210
+ tts_conf:
211
+ adim: 384
212
+ aheads: 2
213
+ elayers: 4
214
+ eunits: 1536
215
+ dlayers: 4
216
+ dunits: 1536
217
+ positionwise_layer_type: conv1d
218
+ positionwise_conv_kernel_size: 3
219
+ duration_predictor_layers: 2
220
+ duration_predictor_chans: 256
221
+ duration_predictor_kernel_size: 3
222
+ postnet_layers: 5
223
+ postnet_filts: 5
224
+ postnet_chans: 256
225
+ use_masking: true
226
+ use_scaled_pos_enc: true
227
+ encoder_normalize_before: true
228
+ decoder_normalize_before: true
229
+ reduction_factor: 1
230
+ init_type: xavier_uniform
231
+ init_enc_alpha: 1.0
232
+ init_dec_alpha: 1.0
233
+ transformer_enc_dropout_rate: 0.2
234
+ transformer_enc_positional_dropout_rate: 0.2
235
+ transformer_enc_attn_dropout_rate: 0.2
236
+ transformer_dec_dropout_rate: 0.2
237
+ transformer_dec_positional_dropout_rate: 0.2
238
+ transformer_dec_attn_dropout_rate: 0.2
239
+ pitch_predictor_layers: 5
240
+ pitch_predictor_chans: 256
241
+ pitch_predictor_kernel_size: 5
242
+ pitch_predictor_dropout: 0.5
243
+ pitch_embed_kernel_size: 1
244
+ pitch_embed_dropout: 0.0
245
+ stop_gradient_from_pitch_predictor: true
246
+ energy_predictor_layers: 2
247
+ energy_predictor_chans: 256
248
+ energy_predictor_kernel_size: 3
249
+ energy_predictor_dropout: 0.5
250
+ energy_embed_kernel_size: 1
251
+ energy_embed_dropout: 0.0
252
+ stop_gradient_from_energy_predictor: false
253
+ pitch_extract: dio
254
+ pitch_extract_conf:
255
+ fs: 22050
256
+ n_fft: 1024
257
+ hop_length: 256
258
+ f0max: 400
259
+ f0min: 80
260
+ reduction_factor: 1
261
+ pitch_normalize: global_mvn
262
+ pitch_normalize_conf:
263
+ stats_file: /speech/arun/released_models/tts/female/gujarati/fastspeech2_hs/pitch_stats.npz
264
+ energy_extract: energy
265
+ energy_extract_conf:
266
+ fs: 22050
267
+ n_fft: 1024
268
+ hop_length: 256
269
+ win_length: 1024
270
+ reduction_factor: 1
271
+ energy_normalize: global_mvn
272
+ energy_normalize_conf:
273
+ stats_file: /speech/arun/released_models/tts/female/gujarati/fastspeech2_hs/energy_stats.npz
274
+ required:
275
+ - output_dir
276
+ - token_list
277
+ version: 0.10.7a1
278
+ distributed: true
gujarati/female/model/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a4478a7089410d1efee3f85d49d2c54f6f10f832917843627e8592d92701d15
3
+ size 770
gujarati/female/model/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3d0136072e6aacf0744418f56b08ff518c410aaa3e58676542b923e91d3d21e
3
+ size 1402
gujarati/female/model/feats_type ADDED
@@ -0,0 +1 @@
 
 
1
+ raw
gujarati/female/model/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aed9a0cdb4c0d5952fce48a389a22cd7ab693424b0ebc374ed1504485c5771a
3
+ size 148688073
gujarati/female/model/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d847be75d1ca0f0c0f12f2a58f8481f15d3d91169c2249cc7f8cb0fb21725a76
3
+ size 770
gujarati/male/model/config.yaml ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_fastspeech2_raw_char_None
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 32867
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ grad_clip: 1.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 8
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_tensorboard: true
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: 800
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 3000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/tts_stats_raw_char_None/train/text_shape.char
72
+ - exp/tts_stats_raw_char_None/train/speech_shape
73
+ valid_shape_file:
74
+ - exp/tts_stats_raw_char_None/valid/text_shape.char
75
+ - exp/tts_stats_raw_char_None/valid/speech_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 204800
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/tr_no_dev/text
89
+ - text
90
+ - text
91
+ - - duration_info/tr_no_dev/durations
92
+ - durations
93
+ - text_int
94
+ - - dump/raw/tr_no_dev/wav.scp
95
+ - speech
96
+ - sound
97
+ - - exp/tts_stats_raw_char_None/train/collect_feats/pitch.scp
98
+ - pitch
99
+ - npy
100
+ - - exp/tts_stats_raw_char_None/train/collect_feats/energy.scp
101
+ - energy
102
+ - npy
103
+ valid_data_path_and_name_and_type:
104
+ - - dump/raw/dev/text
105
+ - text
106
+ - text
107
+ - - duration_info/dev/durations
108
+ - durations
109
+ - text_int
110
+ - - dump/raw/dev/wav.scp
111
+ - speech
112
+ - sound
113
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/pitch.scp
114
+ - pitch
115
+ - npy
116
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/energy.scp
117
+ - energy
118
+ - npy
119
+ allow_variable_data_keys: false
120
+ max_cache_size: 0.0
121
+ max_cache_fd: 32
122
+ valid_max_cache_size: null
123
+ optim: adam
124
+ optim_conf:
125
+ lr: 1.0
126
+ scheduler: noamlr
127
+ scheduler_conf:
128
+ model_size: 384
129
+ warmup_steps: 4000
130
+ token_list:
131
+ - <blank>
132
+ - <unk>
133
+ - a
134
+ - A
135
+ - E
136
+ - ','
137
+ - r
138
+ - n
139
+ - I
140
+ - k
141
+ - o
142
+ - t
143
+ - m
144
+ - q
145
+ - w
146
+ - s
147
+ - p
148
+ - i
149
+ - y
150
+ - u
151
+ - l
152
+ - j
153
+ - h
154
+ - ट
155
+ - g
156
+ - d
157
+ - $
158
+ - .
159
+ - b
160
+ - श
161
+ - थ
162
+ - ड
163
+ - C
164
+ - ण
165
+ - c
166
+ - U
167
+ - ध
168
+ - B
169
+ - ख
170
+ - ള
171
+ - ष
172
+ - P
173
+ - घ
174
+ - J
175
+ - ठ
176
+ - R
177
+ - ऐ
178
+ - औ
179
+ - ढ
180
+ - ञ
181
+ - H
182
+ - ऑ
183
+ - ऍ
184
+ - M
185
+ - ॠ
186
+ - <sos/eos>
187
+ odim: null
188
+ model_conf: {}
189
+ use_preprocessor: true
190
+ token_type: char
191
+ bpemodel: null
192
+ non_linguistic_symbols: null
193
+ cleaner: null
194
+ g2p: g2p_en_no_space
195
+ feats_extract: fbank
196
+ feats_extract_conf:
197
+ n_fft: 1024
198
+ hop_length: 256
199
+ win_length: 1024
200
+ fs: 22050
201
+ fmin: 0
202
+ fmax: 8000
203
+ n_mels: 80
204
+ normalize: global_mvn
205
+ normalize_conf:
206
+ stats_file: /speech/arun/released_models/tts/male/gujarati/fastspeech2_hs/feats_stats.npz
207
+ tts: fastspeech2
208
+ tts_conf:
209
+ adim: 384
210
+ aheads: 2
211
+ elayers: 4
212
+ eunits: 1536
213
+ dlayers: 4
214
+ dunits: 1536
215
+ positionwise_layer_type: conv1d
216
+ positionwise_conv_kernel_size: 3
217
+ duration_predictor_layers: 2
218
+ duration_predictor_chans: 256
219
+ duration_predictor_kernel_size: 3
220
+ postnet_layers: 5
221
+ postnet_filts: 5
222
+ postnet_chans: 256
223
+ use_masking: true
224
+ use_scaled_pos_enc: true
225
+ encoder_normalize_before: true
226
+ decoder_normalize_before: true
227
+ reduction_factor: 1
228
+ init_type: xavier_uniform
229
+ init_enc_alpha: 1.0
230
+ init_dec_alpha: 1.0
231
+ transformer_enc_dropout_rate: 0.2
232
+ transformer_enc_positional_dropout_rate: 0.2
233
+ transformer_enc_attn_dropout_rate: 0.2
234
+ transformer_dec_dropout_rate: 0.2
235
+ transformer_dec_positional_dropout_rate: 0.2
236
+ transformer_dec_attn_dropout_rate: 0.2
237
+ pitch_predictor_layers: 5
238
+ pitch_predictor_chans: 256
239
+ pitch_predictor_kernel_size: 5
240
+ pitch_predictor_dropout: 0.5
241
+ pitch_embed_kernel_size: 1
242
+ pitch_embed_dropout: 0.0
243
+ stop_gradient_from_pitch_predictor: true
244
+ energy_predictor_layers: 2
245
+ energy_predictor_chans: 256
246
+ energy_predictor_kernel_size: 3
247
+ energy_predictor_dropout: 0.5
248
+ energy_embed_kernel_size: 1
249
+ energy_embed_dropout: 0.0
250
+ stop_gradient_from_energy_predictor: false
251
+ pitch_extract: dio
252
+ pitch_extract_conf:
253
+ fs: 22050
254
+ n_fft: 1024
255
+ hop_length: 256
256
+ f0max: 350
257
+ f0min: 40
258
+ reduction_factor: 1
259
+ pitch_normalize: global_mvn
260
+ pitch_normalize_conf:
261
+ stats_file: /speech/arun/released_models/tts/male/gujarati/fastspeech2_hs/pitch_stats.npz
262
+ energy_extract: energy
263
+ energy_extract_conf:
264
+ fs: 22050
265
+ n_fft: 1024
266
+ hop_length: 256
267
+ win_length: 1024
268
+ reduction_factor: 1
269
+ energy_normalize: global_mvn
270
+ energy_normalize_conf:
271
+ stats_file: /speech/arun/released_models/tts/male/gujarati/fastspeech2_hs/energy_stats.npz
272
+ required:
273
+ - output_dir
274
+ - token_list
275
+ version: 0.10.3a3
276
+ distributed: true
gujarati/male/model/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76c80d81162cb696809ef1de383612c18b2c593d8f633f2a40466adf7cbdde77
3
+ size 770
gujarati/male/model/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5c32a4e80ba2fc02765cd2e50a4b12b08c317ac3654c441564b982157121e95
3
+ size 1402
gujarati/male/model/feats_type ADDED
@@ -0,0 +1 @@
 
 
1
+ raw
gujarati/male/model/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d65d3543fdbb027fbd8a71497ab5a672adbff5aa83e69a0265d478b65e72b719
3
+ size 148691959
gujarati/male/model/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f71113509921e3de8310812e1239bfe3427df0fd1a192a761d13aad1e902f867
3
+ size 770
hifigan/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Jungil Kong
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
hifigan/LJSpeech-1.1/training.txt ADDED
The diff for this file is too large to render. See raw diff
 
hifigan/LJSpeech-1.1/validation.txt ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LJ050-0269|The essential terms of such memoranda might well be embodied in an Executive order.|The essential terms of such memoranda might well be embodied in an Executive order.
2
+ LJ050-0270|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security.|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security.
3
+ LJ050-0271|The demands on the President in the execution of His responsibilities in today's world are so varied and complex|The demands on the President in the execution of His responsibilities in today's world are so varied and complex
4
+ LJ050-0272|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security.|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security.
5
+ LJ050-0273|The Commission has, however, from its examination of the facts of President Kennedy's assassination|The Commission has, however, from its examination of the facts of President Kennedy's assassination
6
+ LJ050-0274|made certain recommendations which it believes would, if adopted,|made certain recommendations which it believes would, if adopted,
7
+ LJ050-0275|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger.|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger.
8
+ LJ050-0276|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that,|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that,
9
+ LJ050-0277|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President,|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President,
10
+ LJ050-0278|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.
11
+ LJ001-0028|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.
12
+ LJ001-0068|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic.|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic.
13
+ LJ002-0149|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo.|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo.
14
+ LJ002-0157|and Susannah Evans, in October the same year, for 2 shillings, with costs of 6 shillings, 8 pence.|and Susannah Evans, in October the same year, for two shillings, with costs of six shillings, eight pence.
15
+ LJ002-0167|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny.|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny.
16
+ LJ003-0042|The completion of this very necessary building was, however, much delayed for want of funds,|The completion of this very necessary building was, however, much delayed for want of funds,
17
+ LJ003-0307|but as yet no suggestion was made to provide prison uniform.|but as yet no suggestion was made to provide prison uniform.
18
+ LJ004-0169|On the dirty bedstead lay a wretched being in the throes of severe illness.|On the dirty bedstead lay a wretched being in the throes of severe illness.
19
+ LJ004-0233|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating.|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating.
20
+ LJ005-0101|whence it deduced the practice and condition of every prison that replied.|whence it deduced the practice and condition of every prison that replied.
21
+ LJ005-0108|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison,|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison,
22
+ LJ005-0202|An examination of this report shows how even the most insignificant township had its jail.|An examination of this report shows how even the most insignificant township had its jail.
23
+ LJ005-0234|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer.|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer.
24
+ LJ005-0248|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners.|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners.
25
+ LJ006-0001|The Chronicles of Newgate, Volume 2. By Arthur Griffiths. Section 9: The first report of the inspector of prisons.|The Chronicles of Newgate, Volume two. By Arthur Griffiths. Section nine: The first report of the inspector of prisons.
26
+ LJ006-0018|One was Mr. William Crawford, the other the Rev. Whitworth Russell.|One was Mr. William Crawford, the other the Rev. Whitworth Russell.
27
+ LJ006-0034|They attended early and late; they mustered the prisoners, examined into their condition,|They attended early and late; they mustered the prisoners, examined into their condition,
28
+ LJ006-0078|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner.|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner.
29
+ LJ007-0217|They go on to say|They go on to say
30
+ LJ007-0243|It was not till the erection of the new prison at Holloway in 1850, and the entire internal reconstruction of Newgate according to new ideas,|It was not till the erection of the new prison at Holloway in eighteen fifty, and the entire internal reconstruction of Newgate according to new ideas,
31
+ LJ008-0087|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor.|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor.
32
+ LJ008-0131|the other he kept between his hands.|the other he kept between his hands.
33
+ LJ008-0140|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity,|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity,
34
+ LJ008-0158|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion.|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion.
35
+ LJ008-0174|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death.|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death.
36
+ LJ010-0047|while in 1850 Her Majesty was the victim of another outrage at the hands of one Pate.|while in eighteen fifty Her Majesty was the victim of another outrage at the hands of one Pate.
37
+ LJ010-0061|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt.|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt.
38
+ LJ010-0105|Thistlewood was discovered next morning in a mean house in White Street, Moorfields.|Thistlewood was discovered next morning in a mean house in White Street, Moorfields.
39
+ LJ010-0233|Here again probably it was partly the love of notoriety which was the incentive,|Here again probably it was partly the love of notoriety which was the incentive,
40
+ LJ010-0234|backed possibly with the hope that, as in a much more recent case,|backed possibly with the hope that, as in a much more recent case,
41
+ LJ010-0258|As the Queen was driving from Buckingham Palace to the Chapel Royal,|As the Queen was driving from Buckingham Palace to the Chapel Royal,
42
+ LJ010-0262|charged him with the offense.|charged him with the offense.
43
+ LJ010-0270|exactly tallied with that of the deformed person "wanted" for the assault on the Queen.|exactly tallied with that of the deformed person "wanted" for the assault on the Queen.
44
+ LJ010-0293|I have already remarked that as violence was more and more eliminated from crimes against the person,|I have already remarked that as violence was more and more eliminated from crimes against the person,
45
+ LJ011-0009|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell.|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell.
46
+ LJ011-0256|By this time the neighbors were aroused, and several people came to the scene of the affray.|By this time the neighbors were aroused, and several people came to the scene of the affray.
47
+ LJ012-0044|When his trade was busiest he set up a second establishment, at the head of which, although he was married,|When his trade was busiest he set up a second establishment, at the head of which, although he was married,
48
+ LJ012-0145|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft.|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft.
49
+ LJ013-0020|he acted in a manner which excited the suspicions of the crew.|he acted in a manner which excited the suspicions of the crew.
50
+ LJ013-0077|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done.|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done.
51
+ LJ013-0228|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for.|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for.
52
+ LJ014-0020|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;
53
+ LJ014-0054|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone.|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone.
54
+ LJ014-0101|he found that it was soft and new, while elsewhere it was set and hard.|he found that it was soft and new, while elsewhere it was set and hard.
55
+ LJ014-0103|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe.|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe.
56
+ LJ014-0263|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.
57
+ LJ014-0272|and 1850 to embezzle and apply to his own purposes some £71,000.|and eighteen fifty to embezzle and apply to his own purposes some seventy-one thousand pounds.
58
+ LJ014-0311|His extensive business had been carried on by fraud.|His extensive business had been carried on by fraud.
59
+ LJ015-0197|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force.|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force.
60
+ LJ016-0089|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery.|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery.
61
+ LJ016-0407|who generally attended the prison services.|who generally attended the prison services.
62
+ LJ016-0443|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched.|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched.
63
+ LJ017-0033|a medical practitioner, charged with doing to death persons who relied upon his professional skill.|a medical practitioner, charged with doing to death persons who relied upon his professional skill.
64
+ LJ017-0038|That the administration of justice should never be interfered with by local prejudice or local feeling|That the administration of justice should never be interfered with by local prejudice or local feeling
65
+ LJ018-0018|he wore gold-rimmed eye-glasses and a gold watch and chain.|he wore gold-rimmed eye-glasses and a gold watch and chain.
66
+ LJ018-0119|His offer was not, however, accepted.|His offer was not, however, accepted.
67
+ LJ018-0280|The commercial experience of these clever rogues was cosmopolitan.|The commercial experience of these clever rogues was cosmopolitan.
68
+ LJ019-0178|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet.|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet.
69
+ LJ019-0240|But no structural alterations were made from the date first quoted until the time of closing the prison in 1881.|But no structural alterations were made from the date first quoted until the time of closing the prison in eighteen eighty-one.
70
+ LJ021-0049|and the curtailment of rank stock speculation through the Securities Exchange Act.|and the curtailment of rank stock speculation through the Securities Exchange Act.
71
+ LJ021-0155|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works.|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works.
72
+ LJ022-0046|It is true that while business and industry are definitely better our relief rolls are still too large.|It is true that while business and industry are definitely better our relief rolls are still too large.
73
+ LJ022-0173|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport,|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport,
74
+ LJ024-0087|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution.|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution.
75
+ LJ024-0110|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay
76
+ LJ024-0119|When before have you found them really at your side in your fights for progress?|When before have you found them really at your side in your fights for progress?
77
+ LJ025-0091|as it was current among contemporary chemists.|as it was current among contemporary chemists.
78
+ LJ026-0029|so in the case under discussion.|so in the case under discussion.
79
+ LJ026-0039|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent.|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent.
80
+ LJ026-0064|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food.|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food.
81
+ LJ026-0105|This is done by diastase, an enzyme of plant cells.|This is done by diastase, an enzyme of plant cells.
82
+ LJ026-0137|and be laid down as "reserve starch" in the cells of root or stem or elsewhere.|and be laid down as "reserve starch" in the cells of root or stem or elsewhere.
83
+ LJ027-0006|In all these lines the facts are drawn together by a strong thread of unity.|In all these lines the facts are drawn together by a strong thread of unity.
84
+ LJ028-0134|He also erected what is called a pensile paradise:|He also erected what is called a pensile paradise:
85
+ LJ028-0138|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be,|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be,
86
+ LJ028-0189|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world.|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world.
87
+ LJ028-0281|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken,|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken,
88
+ LJ029-0188|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater.|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater.
89
+ LJ030-0098|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen,|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen,
90
+ LJ031-0007|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital.|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital.
91
+ LJ031-0091|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries.|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries.
92
+ LJ031-0227|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital,|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital,
93
+ LJ032-0100|Marina Oswald|Marina Oswald
94
+ LJ032-0165|to the exclusion of all others because there are not enough microscopic characteristics present in fibers.|to the exclusion of all others because there are not enough microscopic characteristics present in fibers.
95
+ LJ032-0198|During the period from March 2, 1963, to April 24, 1963,|During the period from March two, nineteen sixty-three, to April twenty-four, nineteen sixty-three,
96
+ LJ033-0046|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so.|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so.
97
+ LJ033-0072|I then stepped off of it and the officer picked it up in the middle and it bent so.|I then stepped off of it and the officer picked it up in the middle and it bent so.
98
+ LJ033-0135|Location of Bag|Location of Bag
99
+ LJ034-0083|The significance of Givens' observation that Oswald was carrying his clipboard|The significance of Givens' observation that Oswald was carrying his clipboard
100
+ LJ034-0179|and, quote, seemed to be sitting a little forward, end quote,|and, quote, seemed to be sitting a little forward, end quote,
101
+ LJ035-0125|Victoria Adams, who worked on the fourth floor of the Depository Building,|Victoria Adams, who worked on the fourth floor of the Depository Building,
102
+ LJ035-0162|approximately 30 to 45 seconds after Oswald's lunchroom encounter with Baker and Truly.|approximately thirty to forty-five seconds after Oswald's lunchroom encounter with Baker and Truly.
103
+ LJ035-0189|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade,|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade,
104
+ LJ035-0208|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor
105
+ LJ036-0216|Tippit got out and started to walk around the front of the car|Tippit got out and started to walk around the front of the car
106
+ LJ037-0093|William Arthur Smith was about a block east of 10th and Patton when he heard shots.|William Arthur Smith was about a block east of tenth and Patton when he heard shots.
107
+ LJ037-0157|taken from Oswald.|taken from Oswald.
108
+ LJ037-0178|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting,|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting,
109
+ LJ037-0219|Oswald's Jacket|Oswald's Jacket
110
+ LJ037-0222|When Oswald was arrested, he did not have a jacket.|When Oswald was arrested, he did not have a jacket.
111
+ LJ038-0017|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb.|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb.
112
+ LJ038-0052|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight.|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight.
113
+ LJ038-0077|Statements of Oswald during Detention.|Statements of Oswald during Detention.
114
+ LJ038-0161|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm.|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm.
115
+ LJ038-0212|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear.|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear.
116
+ LJ039-0103|Oswald, like all Marine recruits, received training on the rifle range at distances up to 500 yards,|Oswald, like all Marine recruits, received training on the rifle range at distances up to five hundred yards,
117
+ LJ039-0149|established that they had been previously loaded and ejected from the assassination rifle,|established that they had been previously loaded and ejected from the assassination rifle,
118
+ LJ040-0107|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of 5 and 7 years,|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of five and seven years,
119
+ LJ040-0119|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave.|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave.
120
+ LJ040-0161|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic.|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic.
121
+ LJ040-0169|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone
122
+ LJ041-0098|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote.|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote.
123
+ LJ042-0017|and see for himself how a revolutionary society operates, a Marxist society.|and see for himself how a revolutionary society operates, a Marxist society.
124
+ LJ042-0070|Oswald was discovered in time to thwart his attempt at suicide.|Oswald was discovered in time to thwart his attempt at suicide.
125
+ LJ042-0161|Immediately after serving out his 3 years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR.|Immediately after serving out his three years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR.
126
+ LJ043-0147|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle.|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle.
127
+ LJ043-0178|as, in fact, one of them did appear after the assassination.|as, in fact, one of them did appear after the assassination.
128
+ LJ043-0183|Oswald did not lack the determination and other traits required|Oswald did not lack the determination and other traits required
129
+ LJ043-0185|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker.|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker.
130
+ LJ044-0057|extensive investigation was not able to connect Oswald with that address, although it did develop the fact|extensive investigation was not able to connect Oswald with that address, although it did develop the fact
131
+ LJ044-0109|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities.|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities.
132
+ LJ045-0081|Although she denied it in some of her testimony before the Commission,|Although she denied it in some of her testimony before the Commission,
133
+ LJ045-0147|She asked Oswald, quote,|She asked Oswald, quote,
134
+ LJ045-0204|he had never found anything to which he felt he could really belong.|he had never found anything to which he felt he could really belong.
135
+ LJ046-0193|and 12 to 15 of these cases as highly dangerous risks.|and twelve to fifteen of these cases as highly dangerous risks.
136
+ LJ046-0244|PRS should have investigated and been prepared to guard against it.|PRS should have investigated and been prepared to guard against it.
137
+ LJ047-0059|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries,|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries,
138
+ LJ047-0142|The Bureau had no earlier information suggesting that Oswald had left the United States.|The Bureau had no earlier information suggesting that Oswald had left the United States.
139
+ LJ048-0035|It was against this background and consistent with the criteria followed by the FBI prior to November 22|It was against this background and consistent with the criteria followed by the FBI prior to November twenty-two
140
+ LJ048-0063|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination.|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination.
141
+ LJ048-0104|There were far safer routes via freeways directly to the Trade Mart,|There were far safer routes via freeways directly to the Trade Mart,
142
+ LJ048-0187|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators.|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators.
143
+ LJ048-0271|will be cause for removal from the Service, end quote.|will be cause for removal from the Service, end quote.
144
+ LJ049-0031|The Presidential vehicle in use in Dallas, described in chapter 2,|The Presidential vehicle in use in Dallas, described in chapter two,
145
+ LJ049-0059|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger,|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger,
146
+ LJ049-0174|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated
147
+ LJ050-0049|and from a specialist in psychiatric prognostication at Walter Reed Hospital.|and from a specialist in psychiatric prognostication at Walter Reed Hospital.
148
+ LJ050-0113|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service,|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service,
149
+ LJ050-0150|Its present manual filing system is obsolete;|Its present manual filing system is obsolete;
150
+ LJ050-0189|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described.|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described.
hifigan/README.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis
2
+
3
+ ### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae
4
+
5
+ In our [paper](https://arxiv.org/abs/2010.05646),
6
+ we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.<br/>
7
+ We provide our implementation and pretrained models as open source in this repository.
8
+
9
+ **Abstract :**
10
+ Several recent works on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms.
11
+ Although such methods improve the sampling efficiency and memory usage,
12
+ their sample quality has not yet reached that of autoregressive and flow-based generative models.
13
+ In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis.
14
+ As speech audio consists of sinusoidal signals with various periods,
15
+ we demonstrate that modeling periodic patterns of audio is crucial for enhancing sample quality.
16
+ A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method
17
+ demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than
18
+ real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen
19
+ speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times
20
+ faster than real-time on CPU with comparable quality to an autoregressive counterpart.
21
+
22
+ Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples.
23
+
24
+
25
+ ## Pre-requisites
26
+ 1. Python >= 3.6
27
+ 2. Clone this repository.
28
+ 3. Install python requirements. Please refer to [requirements.txt](requirements.txt)
29
+ 4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
30
+ And move all wav files to `LJSpeech-1.1/wavs`
31
+
32
+
33
+ ## Training
34
+ ```
35
+ python train.py --config config_v1.json
36
+ ```
37
+ To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.<br>
38
+ Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.<br>
39
+ You can change the path by adding `--checkpoint_path` option.
40
+
41
+ Validation loss during training with V1 generator.<br>
42
+ ![validation loss](./validation_loss.png)
43
+
44
+ ## Pretrained Model
45
+ You can also use pretrained models we provide.<br/>
46
+ [Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)<br/>
47
+ Details of each folder are as follows:
48
+
49
+ |Folder Name|Generator|Dataset|Fine-Tuned|
50
+ |------|---|---|---|
51
+ |LJ_V1|V1|LJSpeech|No|
52
+ |LJ_V2|V2|LJSpeech|No|
53
+ |LJ_V3|V3|LJSpeech|No|
54
+ |LJ_FT_T2_V1|V1|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
55
+ |LJ_FT_T2_V2|V2|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
56
+ |LJ_FT_T2_V3|V3|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
57
+ |VCTK_V1|V1|VCTK|No|
58
+ |VCTK_V2|V2|VCTK|No|
59
+ |VCTK_V3|V3|VCTK|No|
60
+ |UNIVERSAL_V1|V1|Universal|No|
61
+
62
+ We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets.
63
+
64
+ ## Fine-Tuning
65
+ 1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.<br/>
66
+ The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.<br/>
67
+ Example:
68
+ ```
69
+ Audio File : LJ001-0001.wav
70
+ Mel-Spectrogram File : LJ001-0001.npy
71
+ ```
72
+ 2. Create `ft_dataset` folder and copy the generated mel-spectrogram files into it.<br/>
73
+ 3. Run the following command.
74
+ ```
75
+ python train.py --fine_tuning True --config config_v1.json
76
+ ```
77
+ For other command line options, please refer to the training section.
78
+
79
+
80
+ ## Inference from wav file
81
+ 1. Make `test_files` directory and copy wav files into the directory.
82
+ 2. Run the following command.
83
+ ```
84
+ python inference.py --checkpoint_file [generator checkpoint file path]
85
+ ```
86
+ Generated wav files are saved in `generated_files` by default.<br>
87
+ You can change the path by adding `--output_dir` option.
88
+
89
+
90
+ ## Inference for end-to-end speech synthesis
91
+ 1. Make `test_mel_files` directory and copy generated mel-spectrogram files into the directory.<br>
92
+ You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2),
93
+ [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth.
94
+ 2. Run the following command.
95
+ ```
96
+ python inference_e2e.py --checkpoint_file [generator checkpoint file path]
97
+ ```
98
+ Generated wav files are saved in `generated_files_from_mel` by default.<br>
99
+ You can change the path by adding `--output_dir` option.
100
+
101
+
102
+ ## Acknowledgements
103
+ We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips)
104
+ and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this.
105
+
hifigan/__init__.py ADDED
File without changes
hifigan/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (131 Bytes). View file
 
hifigan/__pycache__/env.cpython-311.pyc ADDED
Binary file (1.32 kB). View file
 
hifigan/__pycache__/env.cpython-37.pyc ADDED
Binary file (749 Bytes). View file
 
hifigan/__pycache__/env.cpython-39.pyc ADDED
Binary file (785 Bytes). View file
 
hifigan/__pycache__/meldataset.cpython-311.pyc ADDED
Binary file (11.7 kB). View file
 
hifigan/__pycache__/meldataset.cpython-37.pyc ADDED
Binary file (5.38 kB). View file
 
hifigan/__pycache__/meldataset.cpython-38.pyc ADDED
Binary file (5.45 kB). View file
 
hifigan/__pycache__/meldataset.cpython-39.pyc ADDED
Binary file (5.46 kB). View file
 
hifigan/__pycache__/models.cpython-311.pyc ADDED
Binary file (19.1 kB). View file
 
hifigan/__pycache__/models.cpython-37.pyc ADDED
Binary file (8.9 kB). View file
 
hifigan/__pycache__/models.cpython-39.pyc ADDED
Binary file (8.7 kB). View file
 
hifigan/__pycache__/utils.cpython-311.pyc ADDED
Binary file (3.46 kB). View file
 
hifigan/__pycache__/utils.cpython-37.pyc ADDED
Binary file (1.88 kB). View file
 
hifigan/__pycache__/utils.cpython-39.pyc ADDED
Binary file (1.99 kB). View file
 
hifigan/config.yaml ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_fastspeech2_raw_char_None
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 3
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 52297
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ grad_clip: 1.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 8
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_tensorboard: true
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: 800
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 3000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/tts_stats_raw_char_None/train/text_shape.char
72
+ - exp/tts_stats_raw_char_None/train/speech_shape
73
+ valid_shape_file:
74
+ - exp/tts_stats_raw_char_None/valid/text_shape.char
75
+ - exp/tts_stats_raw_char_None/valid/speech_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 204800
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/tr_no_dev/text
89
+ - text
90
+ - text
91
+ - - exp/tts_train_raw_char_None/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
92
+ - durations
93
+ - text_int
94
+ - - dump/raw/tr_no_dev/wav.scp
95
+ - speech
96
+ - sound
97
+ valid_data_path_and_name_and_type:
98
+ - - dump/raw/dev/text
99
+ - text
100
+ - text
101
+ - - exp/tts_train_raw_char_None/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
102
+ - durations
103
+ - text_int
104
+ - - dump/raw/dev/wav.scp
105
+ - speech
106
+ - sound
107
+ allow_variable_data_keys: false
108
+ max_cache_size: 0.0
109
+ max_cache_fd: 32
110
+ valid_max_cache_size: null
111
+ optim: adam
112
+ optim_conf:
113
+ lr: 1.0
114
+ scheduler: noamlr
115
+ scheduler_conf:
116
+ model_size: 384
117
+ warmup_steps: 4000
118
+ token_list:
119
+ - <blank>
120
+ - <unk>
121
+ - <space>
122
+ - a
123
+ - A
124
+ - E
125
+ - k
126
+ - r
127
+ - I
128
+ - n
129
+ - s
130
+ - h
131
+ - i
132
+ - q
133
+ - t
134
+ - m
135
+ - o
136
+ - l
137
+ - p
138
+ - u
139
+ - y
140
+ - b
141
+ - d
142
+ - w
143
+ - ऐ
144
+ - g
145
+ - j
146
+ - c
147
+ - ट
148
+ - थ
149
+ - श
150
+ - U
151
+ - B
152
+ - औ
153
+ - ख
154
+ - ड
155
+ - z
156
+ - ध
157
+ - D
158
+ - f
159
+ - C
160
+ - M
161
+ - ष
162
+ - ण
163
+ - ठ
164
+ - J
165
+ - घ
166
+ - ऑ
167
+ - P
168
+ - क
169
+ - R
170
+ - T
171
+ - K
172
+ - ढ
173
+ - G
174
+ - ञ
175
+ - H
176
+ - ङ
177
+ - Y
178
+ - ऍ
179
+ - र
180
+ - <sos/eos>
181
+ odim: null
182
+ model_conf: {}
183
+ use_preprocessor: true
184
+ token_type: char
185
+ bpemodel: null
186
+ non_linguistic_symbols: null
187
+ cleaner: null
188
+ g2p: g2p_en_no_space
189
+ feats_extract: fbank
190
+ feats_extract_conf:
191
+ n_fft: 1024
192
+ hop_length: 256
193
+ win_length: 1024
194
+ fs: 22050
195
+ fmin: 0
196
+ fmax: 8000
197
+ n_mels: 80
198
+ normalize: global_mvn
199
+ normalize_conf:
200
+ stats_file: /var/www/html/IITM_TTS/E2E_TTS_FS2/fastspeech2/models/Hindi_male/feats_stats.npz
201
+ tts: fastspeech2
202
+ tts_conf:
203
+ adim: 384
204
+ aheads: 2
205
+ elayers: 4
206
+ eunits: 1536
207
+ dlayers: 4
208
+ dunits: 1536
209
+ positionwise_layer_type: conv1d
210
+ positionwise_conv_kernel_size: 3
211
+ duration_predictor_layers: 2
212
+ duration_predictor_chans: 256
213
+ duration_predictor_kernel_size: 3
214
+ postnet_layers: 5
215
+ postnet_filts: 5
216
+ postnet_chans: 256
217
+ use_masking: true
218
+ use_scaled_pos_enc: true
219
+ encoder_normalize_before: true
220
+ decoder_normalize_before: true
221
+ reduction_factor: 1
222
+ init_type: xavier_uniform
223
+ init_enc_alpha: 1.0
224
+ init_dec_alpha: 1.0
225
+ transformer_enc_dropout_rate: 0.2
226
+ transformer_enc_positional_dropout_rate: 0.2
227
+ transformer_enc_attn_dropout_rate: 0.2
228
+ transformer_dec_dropout_rate: 0.2
229
+ transformer_dec_positional_dropout_rate: 0.2
230
+ transformer_dec_attn_dropout_rate: 0.2
231
+ pitch_predictor_layers: 5
232
+ pitch_predictor_chans: 256
233
+ pitch_predictor_kernel_size: 5
234
+ pitch_predictor_dropout: 0.5
235
+ pitch_embed_kernel_size: 1
236
+ pitch_embed_dropout: 0.0
237
+ stop_gradient_from_pitch_predictor: true
238
+ energy_predictor_layers: 2
239
+ energy_predictor_chans: 256
240
+ energy_predictor_kernel_size: 3
241
+ energy_predictor_dropout: 0.5
242
+ energy_embed_kernel_size: 1
243
+ energy_embed_dropout: 0.0
244
+ stop_gradient_from_energy_predictor: false
245
+ pitch_extract: dio
246
+ pitch_extract_conf:
247
+ fs: 22050
248
+ n_fft: 1024
249
+ hop_length: 256
250
+ f0max: 350
251
+ f0min: 40
252
+ reduction_factor: 1
253
+ pitch_normalize: global_mvn
254
+ pitch_normalize_conf:
255
+ stats_file: /var/www/html/IITM_TTS/E2E_TTS_FS2/fastspeech2/models/Hindi_male/pitch_stats.npz
256
+ energy_extract: energy
257
+ energy_extract_conf:
258
+ fs: 22050
259
+ n_fft: 1024
260
+ hop_length: 256
261
+ win_length: 1024
262
+ reduction_factor: 1
263
+ energy_normalize: global_mvn
264
+ energy_normalize_conf:
265
+ stats_file: /var/www/html/IITM_TTS/E2E_TTS_FS2/fastspeech2/models/Hindi_male/energy_stats.npz
266
+ required:
267
+ - output_dir
268
+ - token_list
269
+ version: 0.10.3a3
270
+ distributed: true
hifigan/config_v1.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 0,
4
+ "batch_size": 16,
5
+ "learning_rate": 0.0002,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.999,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [8,8,2,2],
12
+ "upsample_kernel_sizes": [16,16,4,4],
13
+ "upsample_initial_channel": 512,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+
17
+ "segment_size": 8192,
18
+ "num_mels": 80,
19
+ "num_freq": 1025,
20
+ "n_fft": 1024,
21
+ "hop_size": 256,
22
+ "win_size": 1024,
23
+
24
+ "sampling_rate": 22050,
25
+
26
+ "fmin": 0,
27
+ "fmax": 8000,
28
+ "fmax_for_loss": null,
29
+
30
+ "num_workers": 4,
31
+
32
+ "dist_config": {
33
+ "dist_backend": "nccl",
34
+ "dist_url": "tcp://localhost:54321",
35
+ "world_size": 1
36
+ }
37
+ }
hifigan/config_v2.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 0,
4
+ "batch_size": 16,
5
+ "learning_rate": 0.0002,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.999,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [8,8,2,2],
12
+ "upsample_kernel_sizes": [16,16,4,4],
13
+ "upsample_initial_channel": 128,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+
17
+ "segment_size": 8192,
18
+ "num_mels": 80,
19
+ "num_freq": 1025,
20
+ "n_fft": 1024,
21
+ "hop_size": 256,
22
+ "win_size": 1024,
23
+
24
+ "sampling_rate": 22050,
25
+
26
+ "fmin": 0,
27
+ "fmax": 8000,
28
+ "fmax_for_loss": null,
29
+
30
+ "num_workers": 4,
31
+
32
+ "dist_config": {
33
+ "dist_backend": "nccl",
34
+ "dist_url": "tcp://localhost:54321",
35
+ "world_size": 1
36
+ }
37
+ }
hifigan/config_v3.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "resblock": "2",
3
+ "num_gpus": 0,
4
+ "batch_size": 16,
5
+ "learning_rate": 0.0002,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.999,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [8,8,4],
12
+ "upsample_kernel_sizes": [16,16,8],
13
+ "upsample_initial_channel": 256,
14
+ "resblock_kernel_sizes": [3,5,7],
15
+ "resblock_dilation_sizes": [[1,2], [2,6], [3,12]],
16
+
17
+ "segment_size": 8192,
18
+ "num_mels": 80,
19
+ "num_freq": 1025,
20
+ "n_fft": 1024,
21
+ "hop_size": 256,
22
+ "win_size": 1024,
23
+
24
+ "sampling_rate": 22050,
25
+
26
+ "fmin": 0,
27
+ "fmax": 8000,
28
+ "fmax_for_loss": null,
29
+
30
+ "num_workers": 4,
31
+
32
+ "dist_config": {
33
+ "dist_backend": "nccl",
34
+ "dist_url": "tcp://localhost:54321",
35
+ "world_size": 1
36
+ }
37
+ }
hifigan/denorm/test_243.npy.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eb0656828dbbba211b8646a55909806cf622c6c0c4969abc12433b49fe674cb
3
+ size 70730
hifigan/env.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+
4
+
5
+ class AttrDict(dict):
6
+ def __init__(self, *args, **kwargs):
7
+ super(AttrDict, self).__init__(*args, **kwargs)
8
+ self.__dict__ = self
9
+
10
+
11
+ def build_env(config, config_name, path):
12
+ t_path = os.path.join(path, config_name)
13
+ if config != t_path:
14
+ os.makedirs(path, exist_ok=True)
15
+ shutil.copyfile(config, os.path.join(path, config_name))
hifigan/fs2_speed.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ loading model in cpu
2
+ Run 0
3
+ Elapsed time: 0.38950467109680176
4
+ Run 1
5
+ Elapsed time: 0.1787424087524414
6
+ Run 2
7
+ Elapsed time: 0.18103241920471191
8
+ Run 3
9
+ Elapsed time: 0.18195390701293945
10
+ Run 4
11
+ Elapsed time: 0.18042469024658203
12
+ -----------------------------
13
+ loading model in cuda
14
+ Run 0
15
+ Elapsed time: 84.3974118232727
16
+ Run 1
17
+ Elapsed time: 0.12549662590026855
18
+ Run 2
19
+ Elapsed time: 0.12475895881652832
20
+ Run 3
21
+ Elapsed time: 0.12504363059997559
22
+ Run 4
23
+ Elapsed time: 0.12546324729919434
24
+ -----------------------------
hifigan/gen.wav ADDED
Binary file (262 kB). View file
 
hifigan/griffin.wav ADDED
Binary file (176 kB). View file
 
hifigan/hifigan_speed.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ loading model in cpu
2
+ Removing weight norm...
3
+ Run 0
4
+ Elapsed time: 0.453446626663208
5
+ Elapsed time: 2.7247982025146484
6
+ Elapsed time: 5.03496241569519
7
+ Run 1
8
+ Elapsed time: 0.5230855941772461
9
+ Elapsed time: 2.5505268573760986
10
+ Elapsed time: 4.904325246810913
11
+ Run 2
12
+ Elapsed time: 0.5279533863067627
13
+ Elapsed time: 2.5415592193603516
14
+ Elapsed time: 4.775323390960693
15
+ loading model in cuda
16
+ Removing weight norm...
17
+ Run 0
18
+ Elapsed time: 116.25620722770691
19
+ Elapsed time: 0.08193731307983398
20
+ Elapsed time: 0.15532135963439941
21
+ Run 1
22
+ Elapsed time: 0.020008563995361328
23
+ Elapsed time: 0.07747459411621094
24
+ Elapsed time: 0.1503896713256836
25
+ Run 2
26
+ Elapsed time: 0.019192934036254883
27
+ Elapsed time: 0.07719159126281738
28
+ Elapsed time: 0.15003252029418945
hifigan/inference.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import absolute_import, division, print_function, unicode_literals
2
+
3
+ import glob
4
+ import os
5
+ import argparse
6
+ import json
7
+ import torch
8
+ from scipy.io.wavfile import write
9
+ from env import AttrDict
10
+ from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
11
+ from models import Generator
12
+
13
+ h = None
14
+ device = None
15
+
16
+
17
+ def load_checkpoint(filepath, device):
18
+ assert os.path.isfile(filepath)
19
+ print("Loading '{}'".format(filepath))
20
+ checkpoint_dict = torch.load(filepath, map_location=device)
21
+ print("Complete.")
22
+ return checkpoint_dict
23
+
24
+
25
+ def get_mel(x):
26
+ return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
27
+
28
+
29
+ def scan_checkpoint(cp_dir, prefix):
30
+ pattern = os.path.join(cp_dir, prefix + '*')
31
+ cp_list = glob.glob(pattern)
32
+ if len(cp_list) == 0:
33
+ return ''
34
+ return sorted(cp_list)[-1]
35
+
36
+
37
+ def inference(a):
38
+ generator = Generator(h).to(device)
39
+
40
+ state_dict_g = load_checkpoint(a.checkpoint_file, device)
41
+ generator.load_state_dict(state_dict_g['generator'])
42
+
43
+ filelist = os.listdir(a.input_wavs_dir)
44
+
45
+ os.makedirs(a.output_dir, exist_ok=True)
46
+
47
+ generator.eval()
48
+ generator.remove_weight_norm()
49
+ with torch.no_grad():
50
+ for i, filname in enumerate(filelist):
51
+ wav, sr = load_wav(os.path.join(a.input_wavs_dir, filname))
52
+ wav = wav / MAX_WAV_VALUE
53
+ wav = torch.FloatTensor(wav).to(device)
54
+ x = get_mel(wav.unsqueeze(0))
55
+ y_g_hat = generator(x)
56
+ audio = y_g_hat.squeeze()
57
+ audio = audio * MAX_WAV_VALUE
58
+ audio = audio.cpu().numpy().astype('int16')
59
+
60
+ output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated.wav')
61
+ write(output_file, h.sampling_rate, audio)
62
+ print(output_file)
63
+
64
+
65
+ def main():
66
+ print('Initializing Inference Process..')
67
+
68
+ parser = argparse.ArgumentParser()
69
+ parser.add_argument('--input_wavs_dir', default='test_files')
70
+ parser.add_argument('--output_dir', default='generated_files')
71
+ parser.add_argument('--checkpoint_file', required=True)
72
+ a = parser.parse_args()
73
+
74
+ config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json')
75
+ with open(config_file) as f:
76
+ data = f.read()
77
+
78
+ global h
79
+ json_config = json.loads(data)
80
+ h = AttrDict(json_config)
81
+
82
+ torch.manual_seed(h.seed)
83
+ global device
84
+ if torch.cuda.is_available():
85
+ torch.cuda.manual_seed(h.seed)
86
+ device = torch.device('cuda')
87
+ else:
88
+ device = torch.device('cpu')
89
+
90
+ inference(a)
91
+
92
+
93
+ if __name__ == '__main__':
94
+ main()
95
+
hifigan/inference_e2e.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import absolute_import, division, print_function, unicode_literals
2
+
3
+ import glob
4
+ import os
5
+ import numpy as np
6
+ import argparse
7
+ import json
8
+ import torch
9
+ from scipy.io.wavfile import write
10
+ from env import AttrDict
11
+ from meldataset import MAX_WAV_VALUE
12
+ from models import Generator
13
+
14
+ h = None
15
+ device = None
16
+
17
+
18
+ def load_checkpoint(filepath, device):
19
+ assert os.path.isfile(filepath)
20
+ print("Loading '{}'".format(filepath))
21
+ checkpoint_dict = torch.load(filepath, map_location=device)
22
+ print("Complete.")
23
+ return checkpoint_dict
24
+
25
+
26
+ def scan_checkpoint(cp_dir, prefix):
27
+ pattern = os.path.join(cp_dir, prefix + '*')
28
+ cp_list = glob.glob(pattern)
29
+ if len(cp_list) == 0:
30
+ return ''
31
+ return sorted(cp_list)[-1]
32
+
33
+
34
+ def inference(a):
35
+ generator = Generator(h).to(device)
36
+
37
+ state_dict_g = load_checkpoint(a.checkpoint_file, device)
38
+ generator.load_state_dict(state_dict_g['generator'])
39
+
40
+ filelist = os.listdir(a.input_mels_dir)
41
+
42
+ os.makedirs(a.output_dir, exist_ok=True)
43
+
44
+ generator.eval()
45
+ generator.remove_weight_norm()
46
+ with torch.no_grad():
47
+ for i, filname in enumerate(filelist):
48
+ x = np.load(os.path.join(a.input_mels_dir, filname))
49
+ x = torch.FloatTensor(x).to(device)
50
+ y_g_hat = generator(x)
51
+ audio = y_g_hat.squeeze()
52
+ audio = audio * MAX_WAV_VALUE
53
+ audio = audio.cpu().numpy().astype('int16')
54
+
55
+ output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated_e2e.wav')
56
+ write(output_file, h.sampling_rate, audio)
57
+ print(output_file)
58
+
59
+
60
+ def main():
61
+ print('Initializing Inference Process..')
62
+
63
+ parser = argparse.ArgumentParser()
64
+ parser.add_argument('--input_mels_dir', default='test_mel_files')
65
+ parser.add_argument('--output_dir', default='generated_files_from_mel')
66
+ parser.add_argument('--checkpoint_file', required=True)
67
+ a = parser.parse_args()
68
+
69
+ config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json')
70
+ with open(config_file) as f:
71
+ data = f.read()
72
+
73
+ global h
74
+ json_config = json.loads(data)
75
+ h = AttrDict(json_config)
76
+
77
+ torch.manual_seed(h.seed)
78
+ global device
79
+ if torch.cuda.is_available():
80
+ torch.cuda.manual_seed(h.seed)
81
+ device = torch.device('cuda')
82
+ else:
83
+ device = torch.device('cpu')
84
+
85
+ inference(a)
86
+
87
+
88
+ if __name__ == '__main__':
89
+ main()
90
+
hifigan/inference_from_espnet.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import absolute_import, division, print_function, unicode_literals
2
+ import glob
3
+ import os
4
+ import argparse
5
+ import json
6
+ import torch
7
+ import numpy as np
8
+ from scipy.io.wavfile import write
9
+ from env import AttrDict
10
+ from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
11
+ from models import Generator
12
+ import time
13
+
14
+ h = None
15
+ device = "cpu"
16
+
17
+
18
+ def load_checkpoint(filepath, device):
19
+ assert os.path.isfile(filepath)
20
+ print("Loading '{}'".format(filepath))
21
+ checkpoint_dict = torch.load(filepath, map_location=device)
22
+ print("Complete.")
23
+ return checkpoint_dict
24
+
25
+
26
+ def get_mel(x):
27
+ return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
28
+
29
+
30
+ def scan_checkpoint(cp_dir, prefix):
31
+ pattern = os.path.join(cp_dir, prefix + '*')
32
+ cp_list = glob.glob(pattern)
33
+ if len(cp_list) == 0:
34
+ return ''
35
+ return sorted(cp_list)[-1]
36
+
37
+
38
+ def inference(a):
39
+ generator = Generator(h).to(device)
40
+
41
+ state_dict_g = load_checkpoint(a.checkpoint_file, device)
42
+ generator.load_state_dict(state_dict_g['generator'])
43
+
44
+ filelist = os.listdir(a.input_wavs_dir)
45
+
46
+ os.makedirs(a.output_dir, exist_ok=True)
47
+
48
+ generator.eval()
49
+ generator.remove_weight_norm()
50
+ with torch.no_grad():
51
+ for i, filname in enumerate(filelist):
52
+ print(filname)
53
+ # wav, sr = load_wav(os.path.join(a.input_wavs_dir, filname))
54
+ # wav = wav / MAX_WAV_VALUE
55
+ # wav = torch.FloatTensor(wav).to(device)
56
+ # x = get_mel(wav.unsqueeze(0))
57
+ # print("x is ", x.shape)
58
+ arr2 = torch.load(os.path.join(a.input_wavs_dir, filname))
59
+ print("arr2 type", type(arr2))
60
+ # arr = np.load(os.path.join(a.input_wavs_dir, filname))
61
+ arr = np.array(arr2).astype(float)
62
+ print("arr type", type(arr))
63
+ # arr = np.loadtxt(os.path.join(a.input_wavs_dir, filname),dtype='float')
64
+ if arr.shape[0]!=80:
65
+ arr = arr.T
66
+ print(arr.shape)
67
+ # arr = x.detach().cpu().numpy()
68
+ # print(arr.shape[0],arr.shape[1],arr.shape[2])
69
+ # arr_new = arr.reshape(arr.shape[1],arr.shape[2])
70
+ # print(arr_new.shape)
71
+ arr_new2 = arr.reshape(1,arr.shape[0],arr.shape[1])
72
+ ###x_new = torch.from_numpy(arr_new2).float().to(device)
73
+ x_new = torch.FloatTensor(arr_new2).to(device)
74
+ print("x_new",x_new.shape)
75
+ # x = x_new
76
+ # np.savetxt('tests/' + filname + '.txt', arr_new)
77
+ # y_new = torch.from_numpy(arr.unsqueeze(0))
78
+ # print(y_new.shape)
79
+
80
+ st = time.time()
81
+ y_g_hat = generator(x_new)
82
+ et = time.time()
83
+ print("Time taken by generator:", (et-st))
84
+ audio = y_g_hat.squeeze()
85
+ audio = audio * MAX_WAV_VALUE
86
+ audio = audio.cpu().numpy().astype('int16')
87
+
88
+ output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated.wav')
89
+ write(output_file, h.sampling_rate, audio)
90
+ print(output_file)
91
+
92
+
93
+ def main():
94
+ print('Initializing Inference Process..')
95
+
96
+ parser = argparse.ArgumentParser()
97
+ parser.add_argument('--input_wavs_dir', default='denorm')
98
+ parser.add_argument('--output_dir', default='wav_folder')
99
+ parser.add_argument('--checkpoint_file', required=True)
100
+ a = parser.parse_args()
101
+
102
+ config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json')
103
+ with open(config_file) as f:
104
+ data = f.read()
105
+
106
+ global h
107
+ json_config = json.loads(data)
108
+ h = AttrDict(json_config)
109
+
110
+ torch.manual_seed(h.seed)
111
+ global device
112
+ if device is None and torch.cuda.is_available():
113
+ torch.cuda.manual_seed(h.seed)
114
+ device = torch.device('cuda')
115
+ else:
116
+ device = torch.device('cpu')
117
+
118
+ print("device", device)
119
+ inference(a)
120
+
121
+
122
+ if __name__ == '__main__':
123
+ main()
124
+
hifigan/meldataset.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import random
4
+ import torch
5
+ import torch.utils.data
6
+ import numpy as np
7
+ from librosa.util import normalize
8
+ from scipy.io.wavfile import read
9
+ from librosa.filters import mel as librosa_mel_fn
10
+
11
+ MAX_WAV_VALUE = 32768.0
12
+
13
+
14
+ def load_wav(full_path):
15
+ sampling_rate, data = read(full_path)
16
+ return data, sampling_rate
17
+
18
+
19
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
20
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
21
+
22
+
23
+ def dynamic_range_decompression(x, C=1):
24
+ return np.exp(x) / C
25
+
26
+
27
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
28
+ return torch.log(torch.clamp(x, min=clip_val) * C)
29
+
30
+
31
+ def dynamic_range_decompression_torch(x, C=1):
32
+ return torch.exp(x) / C
33
+
34
+
35
+ def spectral_normalize_torch(magnitudes):
36
+ output = dynamic_range_compression_torch(magnitudes)
37
+ return output
38
+
39
+
40
+ def spectral_de_normalize_torch(magnitudes):
41
+ output = dynamic_range_decompression_torch(magnitudes)
42
+ return output
43
+
44
+
45
+ mel_basis = {}
46
+ hann_window = {}
47
+
48
+
49
+ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
50
+ if torch.min(y) < -1.:
51
+ print('min value is ', torch.min(y))
52
+ if torch.max(y) > 1.:
53
+ print('max value is ', torch.max(y))
54
+
55
+ global mel_basis, hann_window
56
+ if fmax not in mel_basis:
57
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
58
+ mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
59
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
60
+
61
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
62
+ y = y.squeeze(1)
63
+
64
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
65
+ center=center, pad_mode='reflect', normalized=False, onesided=True)
66
+
67
+ spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
68
+
69
+ spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec)
70
+ spec = spectral_normalize_torch(spec)
71
+
72
+ return spec
73
+
74
+
75
+ def get_dataset_filelist(a):
76
+ with open(a.input_training_file, 'r', encoding='utf-8') as fi:
77
+ training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
78
+ for x in fi.read().split('\n') if len(x) > 0]
79
+
80
+ with open(a.input_validation_file, 'r', encoding='utf-8') as fi:
81
+ validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
82
+ for x in fi.read().split('\n') if len(x) > 0]
83
+ return training_files, validation_files
84
+
85
+
86
+ class MelDataset(torch.utils.data.Dataset):
87
+ def __init__(self, training_files, segment_size, n_fft, num_mels,
88
+ hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
89
+ device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None):
90
+ self.audio_files = training_files
91
+ random.seed(1234)
92
+ if shuffle:
93
+ random.shuffle(self.audio_files)
94
+ self.segment_size = segment_size
95
+ self.sampling_rate = sampling_rate
96
+ self.split = split
97
+ self.n_fft = n_fft
98
+ self.num_mels = num_mels
99
+ self.hop_size = hop_size
100
+ self.win_size = win_size
101
+ self.fmin = fmin
102
+ self.fmax = fmax
103
+ self.fmax_loss = fmax_loss
104
+ self.cached_wav = None
105
+ self.n_cache_reuse = n_cache_reuse
106
+ self._cache_ref_count = 0
107
+ self.device = device
108
+ self.fine_tuning = fine_tuning
109
+ self.base_mels_path = base_mels_path
110
+
111
+ def __getitem__(self, index):
112
+ filename = self.audio_files[index]
113
+ if self._cache_ref_count == 0:
114
+ audio, sampling_rate = load_wav(filename)
115
+ audio = audio / MAX_WAV_VALUE
116
+ if not self.fine_tuning:
117
+ audio = normalize(audio) * 0.95
118
+ self.cached_wav = audio
119
+ if sampling_rate != self.sampling_rate:
120
+ raise ValueError("{} SR doesn't match target {} SR".format(
121
+ sampling_rate, self.sampling_rate))
122
+ self._cache_ref_count = self.n_cache_reuse
123
+ else:
124
+ audio = self.cached_wav
125
+ self._cache_ref_count -= 1
126
+
127
+ audio = torch.FloatTensor(audio)
128
+ audio = audio.unsqueeze(0)
129
+
130
+ if not self.fine_tuning:
131
+ if self.split:
132
+ if audio.size(1) >= self.segment_size:
133
+ max_audio_start = audio.size(1) - self.segment_size
134
+ audio_start = random.randint(0, max_audio_start)
135
+ audio = audio[:, audio_start:audio_start+self.segment_size]
136
+ else:
137
+ audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
138
+
139
+ mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
140
+ self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
141
+ center=False)
142
+ else:
143
+ mel = np.load(
144
+ os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy'))
145
+ mel = torch.from_numpy(mel)
146
+
147
+ if len(mel.shape) < 3:
148
+ mel = mel.unsqueeze(0)
149
+
150
+ if self.split:
151
+ frames_per_seg = math.ceil(self.segment_size / self.hop_size)
152
+
153
+ if audio.size(1) >= self.segment_size:
154
+ mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
155
+ mel = mel[:, :, mel_start:mel_start + frames_per_seg]
156
+ audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
157
+ else:
158
+ mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
159
+ audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
160
+
161
+ mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
162
+ self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
163
+ center=False)
164
+
165
+ return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
166
+
167
+ def __len__(self):
168
+ return len(self.audio_files)
hifigan/models.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+ from utils import init_weights, get_padding
7
+
8
+ LRELU_SLOPE = 0.1
9
+
10
+
11
+ class ResBlock1(torch.nn.Module):
12
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
13
+ super(ResBlock1, self).__init__()
14
+ self.h = h
15
+ self.convs1 = nn.ModuleList([
16
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
17
+ padding=get_padding(kernel_size, dilation[0]))),
18
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
19
+ padding=get_padding(kernel_size, dilation[1]))),
20
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
21
+ padding=get_padding(kernel_size, dilation[2])))
22
+ ])
23
+ self.convs1.apply(init_weights)
24
+
25
+ self.convs2 = nn.ModuleList([
26
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
27
+ padding=get_padding(kernel_size, 1))),
28
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
29
+ padding=get_padding(kernel_size, 1))),
30
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
31
+ padding=get_padding(kernel_size, 1)))
32
+ ])
33
+ self.convs2.apply(init_weights)
34
+
35
+ def forward(self, x):
36
+ for c1, c2 in zip(self.convs1, self.convs2):
37
+ xt = F.leaky_relu(x, LRELU_SLOPE)
38
+ xt = c1(xt)
39
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
40
+ xt = c2(xt)
41
+ x = xt + x
42
+ return x
43
+
44
+ def remove_weight_norm(self):
45
+ for l in self.convs1:
46
+ remove_weight_norm(l)
47
+ for l in self.convs2:
48
+ remove_weight_norm(l)
49
+
50
+
51
+ class ResBlock2(torch.nn.Module):
52
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
53
+ super(ResBlock2, self).__init__()
54
+ self.h = h
55
+ self.convs = nn.ModuleList([
56
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
57
+ padding=get_padding(kernel_size, dilation[0]))),
58
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
59
+ padding=get_padding(kernel_size, dilation[1])))
60
+ ])
61
+ self.convs.apply(init_weights)
62
+
63
+ def forward(self, x):
64
+ for c in self.convs:
65
+ xt = F.leaky_relu(x, LRELU_SLOPE)
66
+ xt = c(xt)
67
+ x = xt + x
68
+ return x
69
+
70
+ def remove_weight_norm(self):
71
+ for l in self.convs:
72
+ remove_weight_norm(l)
73
+
74
+
75
+ class Generator(torch.nn.Module):
76
+ def __init__(self, h):
77
+ super(Generator, self).__init__()
78
+ self.h = h
79
+ self.num_kernels = len(h.resblock_kernel_sizes)
80
+ self.num_upsamples = len(h.upsample_rates)
81
+ self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
82
+ resblock = ResBlock1 if h.resblock == '1' else ResBlock2
83
+
84
+ self.ups = nn.ModuleList()
85
+ for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
86
+ self.ups.append(weight_norm(
87
+ ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
88
+ k, u, padding=(k-u)//2)))
89
+
90
+ self.resblocks = nn.ModuleList()
91
+ for i in range(len(self.ups)):
92
+ ch = h.upsample_initial_channel//(2**(i+1))
93
+ for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
94
+ self.resblocks.append(resblock(h, ch, k, d))
95
+
96
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
97
+ self.ups.apply(init_weights)
98
+ self.conv_post.apply(init_weights)
99
+
100
+ def forward(self, x):
101
+ x = self.conv_pre(x)
102
+ for i in range(self.num_upsamples):
103
+ x = F.leaky_relu(x, LRELU_SLOPE)
104
+ x = self.ups[i](x)
105
+ xs = None
106
+ for j in range(self.num_kernels):
107
+ if xs is None:
108
+ xs = self.resblocks[i*self.num_kernels+j](x)
109
+ else:
110
+ xs += self.resblocks[i*self.num_kernels+j](x)
111
+ x = xs / self.num_kernels
112
+ x = F.leaky_relu(x)
113
+ x = self.conv_post(x)
114
+ x = torch.tanh(x)
115
+
116
+ return x
117
+
118
+ def remove_weight_norm(self):
119
+ print('Removing weight norm...')
120
+ for l in self.ups:
121
+ remove_weight_norm(l)
122
+ for l in self.resblocks:
123
+ l.remove_weight_norm()
124
+ remove_weight_norm(self.conv_pre)
125
+ remove_weight_norm(self.conv_post)
126
+
127
+
128
+ class DiscriminatorP(torch.nn.Module):
129
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
130
+ super(DiscriminatorP, self).__init__()
131
+ self.period = period
132
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
133
+ self.convs = nn.ModuleList([
134
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
135
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
136
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
137
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
138
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
139
+ ])
140
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
141
+
142
+ def forward(self, x):
143
+ fmap = []
144
+
145
+ # 1d to 2d
146
+ b, c, t = x.shape
147
+ if t % self.period != 0: # pad first
148
+ n_pad = self.period - (t % self.period)
149
+ x = F.pad(x, (0, n_pad), "reflect")
150
+ t = t + n_pad
151
+ x = x.view(b, c, t // self.period, self.period)
152
+
153
+ for l in self.convs:
154
+ x = l(x)
155
+ x = F.leaky_relu(x, LRELU_SLOPE)
156
+ fmap.append(x)
157
+ x = self.conv_post(x)
158
+ fmap.append(x)
159
+ x = torch.flatten(x, 1, -1)
160
+
161
+ return x, fmap
162
+
163
+
164
+ class MultiPeriodDiscriminator(torch.nn.Module):
165
+ def __init__(self):
166
+ super(MultiPeriodDiscriminator, self).__init__()
167
+ self.discriminators = nn.ModuleList([
168
+ DiscriminatorP(2),
169
+ DiscriminatorP(3),
170
+ DiscriminatorP(5),
171
+ DiscriminatorP(7),
172
+ DiscriminatorP(11),
173
+ ])
174
+
175
+ def forward(self, y, y_hat):
176
+ y_d_rs = []
177
+ y_d_gs = []
178
+ fmap_rs = []
179
+ fmap_gs = []
180
+ for i, d in enumerate(self.discriminators):
181
+ y_d_r, fmap_r = d(y)
182
+ y_d_g, fmap_g = d(y_hat)
183
+ y_d_rs.append(y_d_r)
184
+ fmap_rs.append(fmap_r)
185
+ y_d_gs.append(y_d_g)
186
+ fmap_gs.append(fmap_g)
187
+
188
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
189
+
190
+
191
+ class DiscriminatorS(torch.nn.Module):
192
+ def __init__(self, use_spectral_norm=False):
193
+ super(DiscriminatorS, self).__init__()
194
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
195
+ self.convs = nn.ModuleList([
196
+ norm_f(Conv1d(1, 128, 15, 1, padding=7)),
197
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
198
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
199
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
200
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
201
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
202
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
203
+ ])
204
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
205
+
206
+ def forward(self, x):
207
+ fmap = []
208
+ for l in self.convs:
209
+ x = l(x)
210
+ x = F.leaky_relu(x, LRELU_SLOPE)
211
+ fmap.append(x)
212
+ x = self.conv_post(x)
213
+ fmap.append(x)
214
+ x = torch.flatten(x, 1, -1)
215
+
216
+ return x, fmap
217
+
218
+
219
+ class MultiScaleDiscriminator(torch.nn.Module):
220
+ def __init__(self):
221
+ super(MultiScaleDiscriminator, self).__init__()
222
+ self.discriminators = nn.ModuleList([
223
+ DiscriminatorS(use_spectral_norm=True),
224
+ DiscriminatorS(),
225
+ DiscriminatorS(),
226
+ ])
227
+ self.meanpools = nn.ModuleList([
228
+ AvgPool1d(4, 2, padding=2),
229
+ AvgPool1d(4, 2, padding=2)
230
+ ])
231
+
232
+ def forward(self, y, y_hat):
233
+ y_d_rs = []
234
+ y_d_gs = []
235
+ fmap_rs = []
236
+ fmap_gs = []
237
+ for i, d in enumerate(self.discriminators):
238
+ if i != 0:
239
+ y = self.meanpools[i-1](y)
240
+ y_hat = self.meanpools[i-1](y_hat)
241
+ y_d_r, fmap_r = d(y)
242
+ y_d_g, fmap_g = d(y_hat)
243
+ y_d_rs.append(y_d_r)
244
+ fmap_rs.append(fmap_r)
245
+ y_d_gs.append(y_d_g)
246
+ fmap_gs.append(fmap_g)
247
+
248
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
249
+
250
+
251
+ def feature_loss(fmap_r, fmap_g):
252
+ loss = 0
253
+ for dr, dg in zip(fmap_r, fmap_g):
254
+ for rl, gl in zip(dr, dg):
255
+ loss += torch.mean(torch.abs(rl - gl))
256
+
257
+ return loss*2
258
+
259
+
260
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
261
+ loss = 0
262
+ r_losses = []
263
+ g_losses = []
264
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
265
+ r_loss = torch.mean((1-dr)**2)
266
+ g_loss = torch.mean(dg**2)
267
+ loss += (r_loss + g_loss)
268
+ r_losses.append(r_loss.item())
269
+ g_losses.append(g_loss.item())
270
+
271
+ return loss, r_losses, g_losses
272
+
273
+
274
+ def generator_loss(disc_outputs):
275
+ loss = 0
276
+ gen_losses = []
277
+ for dg in disc_outputs:
278
+ l = torch.mean((1-dg)**2)
279
+ gen_losses.append(l)
280
+ loss += l
281
+
282
+ return loss, gen_losses
283
+
hifigan/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch==1.4.0
2
+ numpy==1.17.4
3
+ librosa==0.7.2
4
+ scipy==1.4.1
5
+ tensorboard==2.0
6
+ soundfile==0.10.3.post1
7
+ matplotlib==3.1.3
hifigan/test_fs2_speed.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from espnet2.bin.tts_inference import Text2Speech
2
+ import time
3
+
4
+ for device in ("cpu", "cuda"):
5
+ print(f"loading model in {device}")
6
+ text2speech = Text2Speech(train_config="/speech/arun/tts/hifigan/config.yaml",model_file="/var/www/html/IITM_TTS/E2E_TTS_FS2/fastspeech2/models/Hindi_male/train.loss.ave.pth",device=device)
7
+ for i in range(5):
8
+ print("Run ",i)
9
+ st = time.time()
10
+ out = text2speech("EटA sटarakcars औr Elgoridam par pAठyakram par pahlE wyAखyAn mEq")
11
+ et = time.time()
12
+ elapsed = (et-st)
13
+ print("Elapsed time:", elapsed)
14
+ print("-----------------------------")
hifigan/test_hifigan_speed.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from models import Generator
from scipy.io.wavfile import write
from meldataset import MAX_WAV_VALUE
import numpy as np
import os
import json
from env import AttrDict
import torch
import time

# Benchmark HiFi-GAN vocoder inference on CPU and GPU: synthesise a
# reference mel spectrogram plus 5x and 10x time-concatenated copies to
# see how generation time scales with input length.
for dev in ("cpu", "cuda"):
    # Skip the CUDA pass gracefully on CPU-only machines instead of crashing.
    if dev == "cuda" and not torch.cuda.is_available():
        print(f"loading model in {dev}")
        print("cuda not available, skipping")
        continue
    print(f"loading model in {dev}")
    device = torch.device(dev)
    y1 = torch.load("/speech/arun/tts/hifigan/denorm/test_243.npy.pt", map_location=device)
    # torch.cat, not torch.concat: the `concat` alias only exists in
    # torch>=1.10, while requirements.txt pins torch==1.4.0.
    y2 = torch.cat([y1] * 5, dim=1)
    y3 = torch.cat([y1] * 10, dim=1)

    config_file = os.path.join('/speech/arun/tts/hifigan/cp_hifigan/config.json')
    with open(config_file) as f:
        json_config = json.loads(f.read())
    h = AttrDict(json_config)
    torch.manual_seed(h.seed)
    generator = Generator(h).to(device)
    state_dict_g = torch.load("/speech/arun/tts/hifigan/cp_hifigan/g_00120000", map_location=device)
    generator.load_state_dict(state_dict_g['generator'])
    generator.eval()
    # Fuse weight norm for faster inference (standard HiFi-GAN eval step).
    generator.remove_weight_norm()
    for i in range(3):
        print("Run ", i)
        for x in [y1, y2, y3]:
            with torch.no_grad():
                st = time.time()
                y_g_hat = generator(x)
                audio = y_g_hat.squeeze()
                audio = audio * MAX_WAV_VALUE
                audio = audio.cpu().numpy().astype('int16')
                # NOTE: the wav write is inside the timed region, so
                # "Elapsed time" includes disk I/O, as in the original.
                output_file = "gen.wav"
                write(output_file, h.sampling_rate, audio)
                et = time.time()
                print("Elapsed time:", et - st)
hifigan/test_tts_speed.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from models import Generator
from scipy.io.wavfile import write
from meldataset import MAX_WAV_VALUE
import numpy as np
import os
import json
from env import AttrDict
import torch
import time
from espnet2.bin.tts_inference import Text2Speech

# Benchmark the full TTS pipeline (FastSpeech2 acoustic model +
# HiFi-GAN vocoder) end to end on CPU and GPU.
for dev in ("cpu", "cuda"):
    # Skip the CUDA pass gracefully on CPU-only machines instead of crashing.
    if dev == "cuda" and not torch.cuda.is_available():
        print(f"loading model in {dev}")
        print("cuda not available, skipping")
        continue
    print(f"loading model in {dev}")
    device = torch.device(dev)
    # Dropped the original y1/y2/y3 tensor loads: they were never used in
    # this script (the vocoder input comes from text2speech below) and
    # relied on torch.concat, which the pinned torch==1.4.0 lacks.

    config_file = os.path.join('/speech/arun/tts/hifigan/cp_hifigan/config.json')
    with open(config_file) as f:
        json_config = json.loads(f.read())
    h = AttrDict(json_config)
    torch.manual_seed(h.seed)
    generator = Generator(h).to(device)
    state_dict_g = torch.load("/speech/arun/tts/hifigan/cp_hifigan/g_00120000", map_location=device)
    generator.load_state_dict(state_dict_g['generator'])
    generator.eval()
    # Fuse weight norm for faster inference (standard HiFi-GAN eval step).
    generator.remove_weight_norm()
    text2speech = Text2Speech(
        train_config="/speech/arun/tts/hifigan/config.yaml",
        model_file="/var/www/html/IITM_TTS/E2E_TTS_FS2/fastspeech2/models/Hindi_male/train.loss.ave.pth",
        device=dev,
    )
    for i in range(3):
        print("Run ", i)
        with torch.no_grad():
            st = time.time()
            out = text2speech("EटA sटarakcars औr Elgoridam par pAठyakram par pahlE wyAखyAn mEq")
            # Acoustic model emits (frames, n_mels); the vocoder wants
            # (batch, n_mels, frames).
            x = out["feat_gen_denorm"].T.unsqueeze(0).to(device)
            y_g_hat = generator(x)
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            audio = audio.cpu().numpy().astype('int16')
            # NOTE: the wav write is inside the timed region, so
            # "Elapsed time" includes disk I/O, as in the original.
            output_file = "gen.wav"
            write(output_file, h.sampling_rate, audio)
            et = time.time()
            print("Elapsed time:", et - st)