utkarsh2299
commited on
Upload 131 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- gujarati/female/model/config.yaml +278 -0
- gujarati/female/model/energy_stats.npz +3 -0
- gujarati/female/model/feats_stats.npz +3 -0
- gujarati/female/model/feats_type +1 -0
- gujarati/female/model/model.pth +3 -0
- gujarati/female/model/pitch_stats.npz +3 -0
- gujarati/male/model/config.yaml +276 -0
- gujarati/male/model/energy_stats.npz +3 -0
- gujarati/male/model/feats_stats.npz +3 -0
- gujarati/male/model/feats_type +1 -0
- gujarati/male/model/model.pth +3 -0
- gujarati/male/model/pitch_stats.npz +3 -0
- hifigan/LICENSE +21 -0
- hifigan/LJSpeech-1.1/training.txt +0 -0
- hifigan/LJSpeech-1.1/validation.txt +150 -0
- hifigan/README.md +105 -0
- hifigan/__init__.py +0 -0
- hifigan/__pycache__/__init__.cpython-37.pyc +0 -0
- hifigan/__pycache__/env.cpython-311.pyc +0 -0
- hifigan/__pycache__/env.cpython-37.pyc +0 -0
- hifigan/__pycache__/env.cpython-39.pyc +0 -0
- hifigan/__pycache__/meldataset.cpython-311.pyc +0 -0
- hifigan/__pycache__/meldataset.cpython-37.pyc +0 -0
- hifigan/__pycache__/meldataset.cpython-38.pyc +0 -0
- hifigan/__pycache__/meldataset.cpython-39.pyc +0 -0
- hifigan/__pycache__/models.cpython-311.pyc +0 -0
- hifigan/__pycache__/models.cpython-37.pyc +0 -0
- hifigan/__pycache__/models.cpython-39.pyc +0 -0
- hifigan/__pycache__/utils.cpython-311.pyc +0 -0
- hifigan/__pycache__/utils.cpython-37.pyc +0 -0
- hifigan/__pycache__/utils.cpython-39.pyc +0 -0
- hifigan/config.yaml +270 -0
- hifigan/config_v1.json +37 -0
- hifigan/config_v2.json +37 -0
- hifigan/config_v3.json +37 -0
- hifigan/denorm/test_243.npy.pt +3 -0
- hifigan/env.py +15 -0
- hifigan/fs2_speed.txt +24 -0
- hifigan/gen.wav +0 -0
- hifigan/griffin.wav +0 -0
- hifigan/hifigan_speed.txt +28 -0
- hifigan/inference.py +95 -0
- hifigan/inference_e2e.py +90 -0
- hifigan/inference_from_espnet.py +124 -0
- hifigan/meldataset.py +168 -0
- hifigan/models.py +283 -0
- hifigan/requirements.txt +7 -0
- hifigan/test_fs2_speed.py +14 -0
- hifigan/test_hifigan_speed.py +42 -0
- hifigan/test_tts_speed.py +45 -0
gujarati/female/model/config.yaml
ADDED
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: conf/tuning/train_fastspeech2.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
dry_run: false
|
5 |
+
iterator_type: sequence
|
6 |
+
output_dir: exp/tts_train_fastspeech2_raw_char_None
|
7 |
+
ngpu: 1
|
8 |
+
seed: 0
|
9 |
+
num_workers: 1
|
10 |
+
num_att_plot: 3
|
11 |
+
dist_backend: nccl
|
12 |
+
dist_init_method: env://
|
13 |
+
dist_world_size: 2
|
14 |
+
dist_rank: 0
|
15 |
+
local_rank: 0
|
16 |
+
dist_master_addr: localhost
|
17 |
+
dist_master_port: 35609
|
18 |
+
dist_launcher: null
|
19 |
+
multiprocessing_distributed: true
|
20 |
+
unused_parameters: false
|
21 |
+
sharded_ddp: false
|
22 |
+
cudnn_enabled: true
|
23 |
+
cudnn_benchmark: false
|
24 |
+
cudnn_deterministic: true
|
25 |
+
collect_stats: false
|
26 |
+
write_collected_feats: false
|
27 |
+
max_epoch: 1000
|
28 |
+
patience: null
|
29 |
+
val_scheduler_criterion:
|
30 |
+
- valid
|
31 |
+
- loss
|
32 |
+
early_stopping_criterion:
|
33 |
+
- valid
|
34 |
+
- loss
|
35 |
+
- min
|
36 |
+
best_model_criterion:
|
37 |
+
- - valid
|
38 |
+
- loss
|
39 |
+
- min
|
40 |
+
- - train
|
41 |
+
- loss
|
42 |
+
- min
|
43 |
+
keep_nbest_models: 5
|
44 |
+
nbest_averaging_interval: 0
|
45 |
+
grad_clip: 1.0
|
46 |
+
grad_clip_type: 2.0
|
47 |
+
grad_noise: false
|
48 |
+
accum_grad: 8
|
49 |
+
no_forward_run: false
|
50 |
+
resume: true
|
51 |
+
train_dtype: float32
|
52 |
+
use_amp: false
|
53 |
+
log_interval: null
|
54 |
+
use_matplotlib: true
|
55 |
+
use_tensorboard: true
|
56 |
+
use_wandb: false
|
57 |
+
wandb_project: null
|
58 |
+
wandb_id: null
|
59 |
+
wandb_entity: null
|
60 |
+
wandb_name: null
|
61 |
+
wandb_model_log_interval: -1
|
62 |
+
detect_anomaly: false
|
63 |
+
pretrain_path: null
|
64 |
+
init_param: []
|
65 |
+
ignore_init_mismatch: false
|
66 |
+
freeze_param: []
|
67 |
+
num_iters_per_epoch: 800
|
68 |
+
batch_size: 20
|
69 |
+
valid_batch_size: null
|
70 |
+
batch_bins: 3000000
|
71 |
+
valid_batch_bins: null
|
72 |
+
train_shape_file:
|
73 |
+
- exp/tts_stats_raw_char_None/train/text_shape.char
|
74 |
+
- exp/tts_stats_raw_char_None/train/speech_shape
|
75 |
+
valid_shape_file:
|
76 |
+
- exp/tts_stats_raw_char_None/valid/text_shape.char
|
77 |
+
- exp/tts_stats_raw_char_None/valid/speech_shape
|
78 |
+
batch_type: numel
|
79 |
+
valid_batch_type: null
|
80 |
+
fold_length:
|
81 |
+
- 150
|
82 |
+
- 204800
|
83 |
+
sort_in_batch: descending
|
84 |
+
sort_batch: descending
|
85 |
+
multiple_iterator: false
|
86 |
+
chunk_length: 500
|
87 |
+
chunk_shift_ratio: 0.5
|
88 |
+
num_cache_chunks: 1024
|
89 |
+
train_data_path_and_name_and_type:
|
90 |
+
- - dump/raw/tr_no_dev/text
|
91 |
+
- text
|
92 |
+
- text
|
93 |
+
- - duration_info/tr_no_dev/durations
|
94 |
+
- durations
|
95 |
+
- text_int
|
96 |
+
- - dump/raw/tr_no_dev/wav.scp
|
97 |
+
- speech
|
98 |
+
- sound
|
99 |
+
- - exp/tts_stats_raw_char_None/train/collect_feats/pitch.scp
|
100 |
+
- pitch
|
101 |
+
- npy
|
102 |
+
- - exp/tts_stats_raw_char_None/train/collect_feats/energy.scp
|
103 |
+
- energy
|
104 |
+
- npy
|
105 |
+
valid_data_path_and_name_and_type:
|
106 |
+
- - dump/raw/dev/text
|
107 |
+
- text
|
108 |
+
- text
|
109 |
+
- - duration_info/dev/durations
|
110 |
+
- durations
|
111 |
+
- text_int
|
112 |
+
- - dump/raw/dev/wav.scp
|
113 |
+
- speech
|
114 |
+
- sound
|
115 |
+
- - exp/tts_stats_raw_char_None/valid/collect_feats/pitch.scp
|
116 |
+
- pitch
|
117 |
+
- npy
|
118 |
+
- - exp/tts_stats_raw_char_None/valid/collect_feats/energy.scp
|
119 |
+
- energy
|
120 |
+
- npy
|
121 |
+
allow_variable_data_keys: false
|
122 |
+
max_cache_size: 0.0
|
123 |
+
max_cache_fd: 32
|
124 |
+
valid_max_cache_size: null
|
125 |
+
optim: adam
|
126 |
+
optim_conf:
|
127 |
+
lr: 1.0
|
128 |
+
scheduler: noamlr
|
129 |
+
scheduler_conf:
|
130 |
+
model_size: 384
|
131 |
+
warmup_steps: 4000
|
132 |
+
token_list:
|
133 |
+
- <blank>
|
134 |
+
- <unk>
|
135 |
+
- a
|
136 |
+
- A
|
137 |
+
- ','
|
138 |
+
- E
|
139 |
+
- r
|
140 |
+
- n
|
141 |
+
- I
|
142 |
+
- k
|
143 |
+
- o
|
144 |
+
- t
|
145 |
+
- m
|
146 |
+
- q
|
147 |
+
- w
|
148 |
+
- s
|
149 |
+
- p
|
150 |
+
- i
|
151 |
+
- y
|
152 |
+
- u
|
153 |
+
- l
|
154 |
+
- j
|
155 |
+
- h
|
156 |
+
- ट
|
157 |
+
- g
|
158 |
+
- d
|
159 |
+
- b
|
160 |
+
- $
|
161 |
+
- .
|
162 |
+
- श
|
163 |
+
- ड
|
164 |
+
- थ
|
165 |
+
- C
|
166 |
+
- ण
|
167 |
+
- c
|
168 |
+
- U
|
169 |
+
- ध
|
170 |
+
- B
|
171 |
+
- ख
|
172 |
+
- ള
|
173 |
+
- P
|
174 |
+
- ष
|
175 |
+
- J
|
176 |
+
- घ
|
177 |
+
- ठ
|
178 |
+
- R
|
179 |
+
- ऐ
|
180 |
+
- औ
|
181 |
+
- ढ
|
182 |
+
- ञ
|
183 |
+
- H
|
184 |
+
- ऑ
|
185 |
+
- ऍ
|
186 |
+
- M
|
187 |
+
- ॠ
|
188 |
+
- <sos/eos>
|
189 |
+
odim: null
|
190 |
+
model_conf: {}
|
191 |
+
use_preprocessor: true
|
192 |
+
token_type: char
|
193 |
+
bpemodel: null
|
194 |
+
non_linguistic_symbols: null
|
195 |
+
cleaner: null
|
196 |
+
g2p: g2p_en_no_space
|
197 |
+
feats_extract: fbank
|
198 |
+
feats_extract_conf:
|
199 |
+
n_fft: 1024
|
200 |
+
hop_length: 256
|
201 |
+
win_length: 1024
|
202 |
+
fs: 22050
|
203 |
+
fmin: 0
|
204 |
+
fmax: 8000
|
205 |
+
n_mels: 80
|
206 |
+
normalize: global_mvn
|
207 |
+
normalize_conf:
|
208 |
+
stats_file: /speech/arun/released_models/tts/female/gujarati/fastspeech2_hs/feats_stats.npz
|
209 |
+
tts: fastspeech2
|
210 |
+
tts_conf:
|
211 |
+
adim: 384
|
212 |
+
aheads: 2
|
213 |
+
elayers: 4
|
214 |
+
eunits: 1536
|
215 |
+
dlayers: 4
|
216 |
+
dunits: 1536
|
217 |
+
positionwise_layer_type: conv1d
|
218 |
+
positionwise_conv_kernel_size: 3
|
219 |
+
duration_predictor_layers: 2
|
220 |
+
duration_predictor_chans: 256
|
221 |
+
duration_predictor_kernel_size: 3
|
222 |
+
postnet_layers: 5
|
223 |
+
postnet_filts: 5
|
224 |
+
postnet_chans: 256
|
225 |
+
use_masking: true
|
226 |
+
use_scaled_pos_enc: true
|
227 |
+
encoder_normalize_before: true
|
228 |
+
decoder_normalize_before: true
|
229 |
+
reduction_factor: 1
|
230 |
+
init_type: xavier_uniform
|
231 |
+
init_enc_alpha: 1.0
|
232 |
+
init_dec_alpha: 1.0
|
233 |
+
transformer_enc_dropout_rate: 0.2
|
234 |
+
transformer_enc_positional_dropout_rate: 0.2
|
235 |
+
transformer_enc_attn_dropout_rate: 0.2
|
236 |
+
transformer_dec_dropout_rate: 0.2
|
237 |
+
transformer_dec_positional_dropout_rate: 0.2
|
238 |
+
transformer_dec_attn_dropout_rate: 0.2
|
239 |
+
pitch_predictor_layers: 5
|
240 |
+
pitch_predictor_chans: 256
|
241 |
+
pitch_predictor_kernel_size: 5
|
242 |
+
pitch_predictor_dropout: 0.5
|
243 |
+
pitch_embed_kernel_size: 1
|
244 |
+
pitch_embed_dropout: 0.0
|
245 |
+
stop_gradient_from_pitch_predictor: true
|
246 |
+
energy_predictor_layers: 2
|
247 |
+
energy_predictor_chans: 256
|
248 |
+
energy_predictor_kernel_size: 3
|
249 |
+
energy_predictor_dropout: 0.5
|
250 |
+
energy_embed_kernel_size: 1
|
251 |
+
energy_embed_dropout: 0.0
|
252 |
+
stop_gradient_from_energy_predictor: false
|
253 |
+
pitch_extract: dio
|
254 |
+
pitch_extract_conf:
|
255 |
+
fs: 22050
|
256 |
+
n_fft: 1024
|
257 |
+
hop_length: 256
|
258 |
+
f0max: 400
|
259 |
+
f0min: 80
|
260 |
+
reduction_factor: 1
|
261 |
+
pitch_normalize: global_mvn
|
262 |
+
pitch_normalize_conf:
|
263 |
+
stats_file: /speech/arun/released_models/tts/female/gujarati/fastspeech2_hs/pitch_stats.npz
|
264 |
+
energy_extract: energy
|
265 |
+
energy_extract_conf:
|
266 |
+
fs: 22050
|
267 |
+
n_fft: 1024
|
268 |
+
hop_length: 256
|
269 |
+
win_length: 1024
|
270 |
+
reduction_factor: 1
|
271 |
+
energy_normalize: global_mvn
|
272 |
+
energy_normalize_conf:
|
273 |
+
stats_file: /speech/arun/released_models/tts/female/gujarati/fastspeech2_hs/energy_stats.npz
|
274 |
+
required:
|
275 |
+
- output_dir
|
276 |
+
- token_list
|
277 |
+
version: 0.10.7a1
|
278 |
+
distributed: true
|
gujarati/female/model/energy_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5a4478a7089410d1efee3f85d49d2c54f6f10f832917843627e8592d92701d15
|
3 |
+
size 770
|
gujarati/female/model/feats_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d3d0136072e6aacf0744418f56b08ff518c410aaa3e58676542b923e91d3d21e
|
3 |
+
size 1402
|
gujarati/female/model/feats_type
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
raw
|
gujarati/female/model/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3aed9a0cdb4c0d5952fce48a389a22cd7ab693424b0ebc374ed1504485c5771a
|
3 |
+
size 148688073
|
gujarati/female/model/pitch_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d847be75d1ca0f0c0f12f2a58f8481f15d3d91169c2249cc7f8cb0fb21725a76
|
3 |
+
size 770
|
gujarati/male/model/config.yaml
ADDED
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: conf/tuning/train_fastspeech2.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
dry_run: false
|
5 |
+
iterator_type: sequence
|
6 |
+
output_dir: exp/tts_train_fastspeech2_raw_char_None
|
7 |
+
ngpu: 1
|
8 |
+
seed: 0
|
9 |
+
num_workers: 1
|
10 |
+
num_att_plot: 3
|
11 |
+
dist_backend: nccl
|
12 |
+
dist_init_method: env://
|
13 |
+
dist_world_size: 4
|
14 |
+
dist_rank: 0
|
15 |
+
local_rank: 0
|
16 |
+
dist_master_addr: localhost
|
17 |
+
dist_master_port: 32867
|
18 |
+
dist_launcher: null
|
19 |
+
multiprocessing_distributed: true
|
20 |
+
unused_parameters: false
|
21 |
+
sharded_ddp: false
|
22 |
+
cudnn_enabled: true
|
23 |
+
cudnn_benchmark: false
|
24 |
+
cudnn_deterministic: true
|
25 |
+
collect_stats: false
|
26 |
+
write_collected_feats: false
|
27 |
+
max_epoch: 1000
|
28 |
+
patience: null
|
29 |
+
val_scheduler_criterion:
|
30 |
+
- valid
|
31 |
+
- loss
|
32 |
+
early_stopping_criterion:
|
33 |
+
- valid
|
34 |
+
- loss
|
35 |
+
- min
|
36 |
+
best_model_criterion:
|
37 |
+
- - valid
|
38 |
+
- loss
|
39 |
+
- min
|
40 |
+
- - train
|
41 |
+
- loss
|
42 |
+
- min
|
43 |
+
keep_nbest_models: 5
|
44 |
+
grad_clip: 1.0
|
45 |
+
grad_clip_type: 2.0
|
46 |
+
grad_noise: false
|
47 |
+
accum_grad: 8
|
48 |
+
no_forward_run: false
|
49 |
+
resume: true
|
50 |
+
train_dtype: float32
|
51 |
+
use_amp: false
|
52 |
+
log_interval: null
|
53 |
+
use_tensorboard: true
|
54 |
+
use_wandb: false
|
55 |
+
wandb_project: null
|
56 |
+
wandb_id: null
|
57 |
+
wandb_entity: null
|
58 |
+
wandb_name: null
|
59 |
+
wandb_model_log_interval: -1
|
60 |
+
detect_anomaly: false
|
61 |
+
pretrain_path: null
|
62 |
+
init_param: []
|
63 |
+
ignore_init_mismatch: false
|
64 |
+
freeze_param: []
|
65 |
+
num_iters_per_epoch: 800
|
66 |
+
batch_size: 20
|
67 |
+
valid_batch_size: null
|
68 |
+
batch_bins: 3000000
|
69 |
+
valid_batch_bins: null
|
70 |
+
train_shape_file:
|
71 |
+
- exp/tts_stats_raw_char_None/train/text_shape.char
|
72 |
+
- exp/tts_stats_raw_char_None/train/speech_shape
|
73 |
+
valid_shape_file:
|
74 |
+
- exp/tts_stats_raw_char_None/valid/text_shape.char
|
75 |
+
- exp/tts_stats_raw_char_None/valid/speech_shape
|
76 |
+
batch_type: numel
|
77 |
+
valid_batch_type: null
|
78 |
+
fold_length:
|
79 |
+
- 150
|
80 |
+
- 204800
|
81 |
+
sort_in_batch: descending
|
82 |
+
sort_batch: descending
|
83 |
+
multiple_iterator: false
|
84 |
+
chunk_length: 500
|
85 |
+
chunk_shift_ratio: 0.5
|
86 |
+
num_cache_chunks: 1024
|
87 |
+
train_data_path_and_name_and_type:
|
88 |
+
- - dump/raw/tr_no_dev/text
|
89 |
+
- text
|
90 |
+
- text
|
91 |
+
- - duration_info/tr_no_dev/durations
|
92 |
+
- durations
|
93 |
+
- text_int
|
94 |
+
- - dump/raw/tr_no_dev/wav.scp
|
95 |
+
- speech
|
96 |
+
- sound
|
97 |
+
- - exp/tts_stats_raw_char_None/train/collect_feats/pitch.scp
|
98 |
+
- pitch
|
99 |
+
- npy
|
100 |
+
- - exp/tts_stats_raw_char_None/train/collect_feats/energy.scp
|
101 |
+
- energy
|
102 |
+
- npy
|
103 |
+
valid_data_path_and_name_and_type:
|
104 |
+
- - dump/raw/dev/text
|
105 |
+
- text
|
106 |
+
- text
|
107 |
+
- - duration_info/dev/durations
|
108 |
+
- durations
|
109 |
+
- text_int
|
110 |
+
- - dump/raw/dev/wav.scp
|
111 |
+
- speech
|
112 |
+
- sound
|
113 |
+
- - exp/tts_stats_raw_char_None/valid/collect_feats/pitch.scp
|
114 |
+
- pitch
|
115 |
+
- npy
|
116 |
+
- - exp/tts_stats_raw_char_None/valid/collect_feats/energy.scp
|
117 |
+
- energy
|
118 |
+
- npy
|
119 |
+
allow_variable_data_keys: false
|
120 |
+
max_cache_size: 0.0
|
121 |
+
max_cache_fd: 32
|
122 |
+
valid_max_cache_size: null
|
123 |
+
optim: adam
|
124 |
+
optim_conf:
|
125 |
+
lr: 1.0
|
126 |
+
scheduler: noamlr
|
127 |
+
scheduler_conf:
|
128 |
+
model_size: 384
|
129 |
+
warmup_steps: 4000
|
130 |
+
token_list:
|
131 |
+
- <blank>
|
132 |
+
- <unk>
|
133 |
+
- a
|
134 |
+
- A
|
135 |
+
- E
|
136 |
+
- ','
|
137 |
+
- r
|
138 |
+
- n
|
139 |
+
- I
|
140 |
+
- k
|
141 |
+
- o
|
142 |
+
- t
|
143 |
+
- m
|
144 |
+
- q
|
145 |
+
- w
|
146 |
+
- s
|
147 |
+
- p
|
148 |
+
- i
|
149 |
+
- y
|
150 |
+
- u
|
151 |
+
- l
|
152 |
+
- j
|
153 |
+
- h
|
154 |
+
- ट
|
155 |
+
- g
|
156 |
+
- d
|
157 |
+
- $
|
158 |
+
- .
|
159 |
+
- b
|
160 |
+
- श
|
161 |
+
- थ
|
162 |
+
- ड
|
163 |
+
- C
|
164 |
+
- ण
|
165 |
+
- c
|
166 |
+
- U
|
167 |
+
- ध
|
168 |
+
- B
|
169 |
+
- ख
|
170 |
+
- ള
|
171 |
+
- ष
|
172 |
+
- P
|
173 |
+
- घ
|
174 |
+
- J
|
175 |
+
- ठ
|
176 |
+
- R
|
177 |
+
- ऐ
|
178 |
+
- औ
|
179 |
+
- ढ
|
180 |
+
- ञ
|
181 |
+
- H
|
182 |
+
- ऑ
|
183 |
+
- ऍ
|
184 |
+
- M
|
185 |
+
- ॠ
|
186 |
+
- <sos/eos>
|
187 |
+
odim: null
|
188 |
+
model_conf: {}
|
189 |
+
use_preprocessor: true
|
190 |
+
token_type: char
|
191 |
+
bpemodel: null
|
192 |
+
non_linguistic_symbols: null
|
193 |
+
cleaner: null
|
194 |
+
g2p: g2p_en_no_space
|
195 |
+
feats_extract: fbank
|
196 |
+
feats_extract_conf:
|
197 |
+
n_fft: 1024
|
198 |
+
hop_length: 256
|
199 |
+
win_length: 1024
|
200 |
+
fs: 22050
|
201 |
+
fmin: 0
|
202 |
+
fmax: 8000
|
203 |
+
n_mels: 80
|
204 |
+
normalize: global_mvn
|
205 |
+
normalize_conf:
|
206 |
+
stats_file: /speech/arun/released_models/tts/male/gujarati/fastspeech2_hs/feats_stats.npz
|
207 |
+
tts: fastspeech2
|
208 |
+
tts_conf:
|
209 |
+
adim: 384
|
210 |
+
aheads: 2
|
211 |
+
elayers: 4
|
212 |
+
eunits: 1536
|
213 |
+
dlayers: 4
|
214 |
+
dunits: 1536
|
215 |
+
positionwise_layer_type: conv1d
|
216 |
+
positionwise_conv_kernel_size: 3
|
217 |
+
duration_predictor_layers: 2
|
218 |
+
duration_predictor_chans: 256
|
219 |
+
duration_predictor_kernel_size: 3
|
220 |
+
postnet_layers: 5
|
221 |
+
postnet_filts: 5
|
222 |
+
postnet_chans: 256
|
223 |
+
use_masking: true
|
224 |
+
use_scaled_pos_enc: true
|
225 |
+
encoder_normalize_before: true
|
226 |
+
decoder_normalize_before: true
|
227 |
+
reduction_factor: 1
|
228 |
+
init_type: xavier_uniform
|
229 |
+
init_enc_alpha: 1.0
|
230 |
+
init_dec_alpha: 1.0
|
231 |
+
transformer_enc_dropout_rate: 0.2
|
232 |
+
transformer_enc_positional_dropout_rate: 0.2
|
233 |
+
transformer_enc_attn_dropout_rate: 0.2
|
234 |
+
transformer_dec_dropout_rate: 0.2
|
235 |
+
transformer_dec_positional_dropout_rate: 0.2
|
236 |
+
transformer_dec_attn_dropout_rate: 0.2
|
237 |
+
pitch_predictor_layers: 5
|
238 |
+
pitch_predictor_chans: 256
|
239 |
+
pitch_predictor_kernel_size: 5
|
240 |
+
pitch_predictor_dropout: 0.5
|
241 |
+
pitch_embed_kernel_size: 1
|
242 |
+
pitch_embed_dropout: 0.0
|
243 |
+
stop_gradient_from_pitch_predictor: true
|
244 |
+
energy_predictor_layers: 2
|
245 |
+
energy_predictor_chans: 256
|
246 |
+
energy_predictor_kernel_size: 3
|
247 |
+
energy_predictor_dropout: 0.5
|
248 |
+
energy_embed_kernel_size: 1
|
249 |
+
energy_embed_dropout: 0.0
|
250 |
+
stop_gradient_from_energy_predictor: false
|
251 |
+
pitch_extract: dio
|
252 |
+
pitch_extract_conf:
|
253 |
+
fs: 22050
|
254 |
+
n_fft: 1024
|
255 |
+
hop_length: 256
|
256 |
+
f0max: 350
|
257 |
+
f0min: 40
|
258 |
+
reduction_factor: 1
|
259 |
+
pitch_normalize: global_mvn
|
260 |
+
pitch_normalize_conf:
|
261 |
+
stats_file: /speech/arun/released_models/tts/male/gujarati/fastspeech2_hs/pitch_stats.npz
|
262 |
+
energy_extract: energy
|
263 |
+
energy_extract_conf:
|
264 |
+
fs: 22050
|
265 |
+
n_fft: 1024
|
266 |
+
hop_length: 256
|
267 |
+
win_length: 1024
|
268 |
+
reduction_factor: 1
|
269 |
+
energy_normalize: global_mvn
|
270 |
+
energy_normalize_conf:
|
271 |
+
stats_file: /speech/arun/released_models/tts/male/gujarati/fastspeech2_hs/energy_stats.npz
|
272 |
+
required:
|
273 |
+
- output_dir
|
274 |
+
- token_list
|
275 |
+
version: 0.10.3a3
|
276 |
+
distributed: true
|
gujarati/male/model/energy_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:76c80d81162cb696809ef1de383612c18b2c593d8f633f2a40466adf7cbdde77
|
3 |
+
size 770
|
gujarati/male/model/feats_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a5c32a4e80ba2fc02765cd2e50a4b12b08c317ac3654c441564b982157121e95
|
3 |
+
size 1402
|
gujarati/male/model/feats_type
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
raw
|
gujarati/male/model/model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d65d3543fdbb027fbd8a71497ab5a672adbff5aa83e69a0265d478b65e72b719
|
3 |
+
size 148691959
|
gujarati/male/model/pitch_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f71113509921e3de8310812e1239bfe3427df0fd1a192a761d13aad1e902f867
|
3 |
+
size 770
|
hifigan/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2020 Jungil Kong
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
hifigan/LJSpeech-1.1/training.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
hifigan/LJSpeech-1.1/validation.txt
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
LJ050-0269|The essential terms of such memoranda might well be embodied in an Executive order.|The essential terms of such memoranda might well be embodied in an Executive order.
|
2 |
+
LJ050-0270|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security.|This Commission can recommend no procedures for the future protection of our Presidents which will guarantee security.
|
3 |
+
LJ050-0271|The demands on the President in the execution of His responsibilities in today's world are so varied and complex|The demands on the President in the execution of His responsibilities in today's world are so varied and complex
|
4 |
+
LJ050-0272|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security.|and the traditions of the office in a democracy such as ours are so deep-seated as to preclude absolute security.
|
5 |
+
LJ050-0273|The Commission has, however, from its examination of the facts of President Kennedy's assassination|The Commission has, however, from its examination of the facts of President Kennedy's assassination
|
6 |
+
LJ050-0274|made certain recommendations which it believes would, if adopted,|made certain recommendations which it believes would, if adopted,
|
7 |
+
LJ050-0275|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger.|materially improve upon the procedures in effect at the time of President Kennedy's assassination and result in a substantial lessening of the danger.
|
8 |
+
LJ050-0276|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that,|As has been pointed out, the Commission has not resolved all the proposals which could be made. The Commission nevertheless is confident that,
|
9 |
+
LJ050-0277|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President,|with the active cooperation of the responsible agencies and with the understanding of the people of the United States in their demands upon their President,
|
10 |
+
LJ050-0278|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.|the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.
|
11 |
+
LJ001-0028|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.
|
12 |
+
LJ001-0068|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic.|The characteristic Dutch type, as represented by the excellent printer Gerard Leew, is very pronounced and uncompromising Gothic.
|
13 |
+
LJ002-0149|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo.|The latter indeed hung like millstones round the neck of the unhappy insolvent wretches who found themselves in limbo.
|
14 |
+
LJ002-0157|and Susannah Evans, in October the same year, for 2 shillings, with costs of 6 shillings, 8 pence.|and Susannah Evans, in October the same year, for two shillings, with costs of six shillings, eight pence.
|
15 |
+
LJ002-0167|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny.|quotes a case which came within his own knowledge of a boy sent to prison for non-payment of one penny.
|
16 |
+
LJ003-0042|The completion of this very necessary building was, however, much delayed for want of funds,|The completion of this very necessary building was, however, much delayed for want of funds,
|
17 |
+
LJ003-0307|but as yet no suggestion was made to provide prison uniform.|but as yet no suggestion was made to provide prison uniform.
|
18 |
+
LJ004-0169|On the dirty bedstead lay a wretched being in the throes of severe illness.|On the dirty bedstead lay a wretched being in the throes of severe illness.
|
19 |
+
LJ004-0233|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating.|Under the new rule visitors were not allowed to pass into the interior of the prison, but were detained between the grating.
|
20 |
+
LJ005-0101|whence it deduced the practice and condition of every prison that replied.|whence it deduced the practice and condition of every prison that replied.
|
21 |
+
LJ005-0108|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison,|the prisoners, without firing, bedding, or sufficient food, spent their days "in surveying their grotesque prison,
|
22 |
+
LJ005-0202|An examination of this report shows how even the most insignificant township had its jail.|An examination of this report shows how even the most insignificant township had its jail.
|
23 |
+
LJ005-0234|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer.|The visits of friends was once more unreservedly allowed, and these incomers freely brought in extra provisions and beer.
|
24 |
+
LJ005-0248|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners.|and stated that in his opinion Newgate, as the common jail of Middlesex, was wholly inadequate to the proper confinement of its prisoners.
|
25 |
+
LJ006-0001|The Chronicles of Newgate, Volume 2. By Arthur Griffiths. Section 9: The first report of the inspector of prisons.|The Chronicles of Newgate, Volume two. By Arthur Griffiths. Section nine: The first report of the inspector of prisons.
|
26 |
+
LJ006-0018|One was Mr. William Crawford, the other the Rev. Whitworth Russell.|One was Mr. William Crawford, the other the Rev. Whitworth Russell.
|
27 |
+
LJ006-0034|They attended early and late; they mustered the prisoners, examined into their condition,|They attended early and late; they mustered the prisoners, examined into their condition,
|
28 |
+
LJ006-0078|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner.|A new prisoner's fate, as to location, rested really with a powerful fellow-prisoner.
|
29 |
+
LJ007-0217|They go on to say|They go on to say
|
30 |
+
LJ007-0243|It was not till the erection of the new prison at Holloway in 1850, and the entire internal reconstruction of Newgate according to new ideas,|It was not till the erection of the new prison at Holloway in eighteen fifty, and the entire internal reconstruction of Newgate according to new ideas,
|
31 |
+
LJ008-0087|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor.|The change from Tyburn to the Old Bailey had worked no improvement as regards the gathering together of the crowd or its demeanor.
|
32 |
+
LJ008-0131|the other he kept between his hands.|the other he kept between his hands.
|
33 |
+
LJ008-0140|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity,|Whenever the public attention had been specially called to a particular crime, either on account of its atrocity,
|
34 |
+
LJ008-0158|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion.|The pressure soon became so frightful that many would have willingly escaped from the crowd; but their attempts only increased the general confusion.
|
35 |
+
LJ008-0174|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death.|One cart-load of spectators having broken down, some of its occupants fell off the vehicle, and were instantly trampled to death.
|
36 |
+
LJ010-0047|while in 1850 Her Majesty was the victim of another outrage at the hands of one Pate.|while in eighteen fifty Her Majesty was the victim of another outrage at the hands of one Pate.
|
37 |
+
LJ010-0061|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt.|That some thirty or more needy men should hope to revolutionize England is a sufficient proof of the absurdity of their attempt.
|
38 |
+
LJ010-0105|Thistlewood was discovered next morning in a mean house in White Street, Moorfields.|Thistlewood was discovered next morning in a mean house in White Street, Moorfields.
|
39 |
+
LJ010-0233|Here again probably it was partly the love of notoriety which was the incentive,|Here again probably it was partly the love of notoriety which was the incentive,
|
40 |
+
LJ010-0234|backed possibly with the hope that, as in a much more recent case,|backed possibly with the hope that, as in a much more recent case,
|
41 |
+
LJ010-0258|As the Queen was driving from Buckingham Palace to the Chapel Royal,|As the Queen was driving from Buckingham Palace to the Chapel Royal,
|
42 |
+
LJ010-0262|charged him with the offense.|charged him with the offense.
|
43 |
+
LJ010-0270|exactly tallied with that of the deformed person "wanted" for the assault on the Queen.|exactly tallied with that of the deformed person "wanted" for the assault on the Queen.
|
44 |
+
LJ010-0293|I have already remarked that as violence was more and more eliminated from crimes against the person,|I have already remarked that as violence was more and more eliminated from crimes against the person,
|
45 |
+
LJ011-0009|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell.|Nothing more was heard of the affair, although the lady declared that she had never instructed Fauntleroy to sell.
|
46 |
+
LJ011-0256|By this time the neighbors were aroused, and several people came to the scene of the affray.|By this time the neighbors were aroused, and several people came to the scene of the affray.
|
47 |
+
LJ012-0044|When his trade was busiest he set up a second establishment, at the head of which, although he was married,|When his trade was busiest he set up a second establishment, at the head of which, although he was married,
|
48 |
+
LJ012-0145|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft.|Solomons was now also admitted as a witness, and his evidence, with that of Moss, secured the transportation of the principal actors in the theft.
|
49 |
+
LJ013-0020|he acted in a manner which excited the suspicions of the crew.|he acted in a manner which excited the suspicions of the crew.
|
50 |
+
LJ013-0077|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done.|Barber and Fletcher were both transported for life, although Fletcher declared that Barber was innocent, and had no guilty knowledge of what was being done.
|
51 |
+
LJ013-0228|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for.|In the pocket of the coat Mr. Cope, the governor, found a neatly-folded cloth, and asked what it was for.
|
52 |
+
LJ014-0020|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;
|
53 |
+
LJ014-0054|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone.|a maidservant, Sarah Thomas, murdered her mistress, an aged woman, by beating out her brains with a stone.
|
54 |
+
LJ014-0101|he found that it was soft and new, while elsewhere it was set and hard.|he found that it was soft and new, while elsewhere it was set and hard.
|
55 |
+
LJ014-0103|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe.|beneath them was a layer of fresh mortar, beneath that a lot of loose earth, amongst which a stocking was turned up, and presently a human toe.
|
56 |
+
LJ014-0263|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.
|
57 |
+
LJ014-0272|and 1850 to embezzle and apply to his own purposes some £71,000.|and eighteen fifty to embezzle and apply to his own purposes some seventy-one thousand pounds.
|
58 |
+
LJ014-0311|His extensive business had been carried on by fraud.|His extensive business had been carried on by fraud.
|
59 |
+
LJ015-0197|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force.|which at one time spread terror throughout London. Thieves preferred now to use ingenuity rather than brute force.
|
60 |
+
LJ016-0089|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery.|He was engaged in whitewashing and cleaning; the officer who had him in charge left him on the stairs leading to the gallery.
|
61 |
+
LJ016-0407|who generally attended the prison services.|who generally attended the prison services.
|
62 |
+
LJ016-0443|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched.|He was promptly rescued from his perilous condition, but not before his face and hands were badly scorched.
|
63 |
+
LJ017-0033|a medical practitioner, charged with doing to death persons who relied upon his professional skill.|a medical practitioner, charged with doing to death persons who relied upon his professional skill.
|
64 |
+
LJ017-0038|That the administration of justice should never be interfered with by local prejudice or local feeling|That the administration of justice should never be interfered with by local prejudice or local feeling
|
65 |
+
LJ018-0018|he wore gold-rimmed eye-glasses and a gold watch and chain.|he wore gold-rimmed eye-glasses and a gold watch and chain.
|
66 |
+
LJ018-0119|His offer was not, however, accepted.|His offer was not, however, accepted.
|
67 |
+
LJ018-0280|The commercial experience of these clever rogues was cosmopolitan.|The commercial experience of these clever rogues was cosmopolitan.
|
68 |
+
LJ019-0178|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet.|and abandoned because of the expense. As to the entire reconstruction of Newgate, nothing had been done as yet.
|
69 |
+
LJ019-0240|But no structural alterations were made from the date first quoted until the time of closing the prison in 1881.|But no structural alterations were made from the date first quoted until the time of closing the prison in eighteen eighty-one.
|
70 |
+
LJ021-0049|and the curtailment of rank stock speculation through the Securities Exchange Act.|and the curtailment of rank stock speculation through the Securities Exchange Act.
|
71 |
+
LJ021-0155|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works.|both directly on the public works themselves, and indirectly in the industries supplying the materials for these public works.
|
72 |
+
LJ022-0046|It is true that while business and industry are definitely better our relief rolls are still too large.|It is true that while business and industry are definitely better our relief rolls are still too large.
|
73 |
+
LJ022-0173|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport,|for the regulation of transportation by water, for the strengthening of our Merchant Marine and Air Transport,
|
74 |
+
LJ024-0087|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution.|I have thus explained to you the reasons that lie behind our efforts to secure results by legislation within the Constitution.
|
75 |
+
LJ024-0110|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay|And the strategy of that last stand is to suggest the time-consuming process of amendment in order to kill off by delay
|
76 |
+
LJ024-0119|When before have you found them really at your side in your fights for progress?|When before have you found them really at your side in your fights for progress?
|
77 |
+
LJ025-0091|as it was current among contemporary chemists.|as it was current among contemporary chemists.
|
78 |
+
LJ026-0029|so in the case under discussion.|so in the case under discussion.
|
79 |
+
LJ026-0039|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent.|the earliest organisms were protists and that from them animals and plants were evolved along divergent lines of descent.
|
80 |
+
LJ026-0064|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food.|but unlike that of the animal, it is not chiefly an income of foods, but only of the raw materials of food.
|
81 |
+
LJ026-0105|This is done by diastase, an enzyme of plant cells.|This is done by diastase, an enzyme of plant cells.
|
82 |
+
LJ026-0137|and be laid down as "reserve starch" in the cells of root or stem or elsewhere.|and be laid down as "reserve starch" in the cells of root or stem or elsewhere.
|
83 |
+
LJ027-0006|In all these lines the facts are drawn together by a strong thread of unity.|In all these lines the facts are drawn together by a strong thread of unity.
|
84 |
+
LJ028-0134|He also erected what is called a pensile paradise:|He also erected what is called a pensile paradise:
|
85 |
+
LJ028-0138|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be,|perhaps the tales that travelers told him were exaggerated as travelers' tales are likely to be,
|
86 |
+
LJ028-0189|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world.|The fall of Babylon with its lofty walls was a most important event in the history of the ancient world.
|
87 |
+
LJ028-0281|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken,|Till mules foal ye shall not take our city, he thought, as he reflected on this speech, that Babylon might now be taken,
|
88 |
+
LJ029-0188|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater.|Stevenson was jeered, jostled, and spat upon by hostile demonstrators outside the Dallas Memorial Auditorium Theater.
|
89 |
+
LJ030-0098|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen,|The remainder of the motorcade consisted of five cars for other dignitaries, including the mayor of Dallas and Texas Congressmen,
|
90 |
+
LJ031-0007|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital.|Chief of Police Curry and police motorcyclists at the head of the motorcade led the way to the hospital.
|
91 |
+
LJ031-0091|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries.|You have to determine which things, which are immediately life threatening and cope with them, before attempting to evaluate the full extent of the injuries.
|
92 |
+
LJ031-0227|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital,|The doctors traced the course of the bullet through the body and, as information was received from Parkland Hospital,
|
93 |
+
LJ032-0100|Marina Oswald|Marina Oswald
|
94 |
+
LJ032-0165|to the exclusion of all others because there are not enough microscopic characteristics present in fibers.|to the exclusion of all others because there are not enough microscopic characteristics present in fibers.
|
95 |
+
LJ032-0198|During the period from March 2, 1963, to April 24, 1963,|During the period from March two, nineteen sixty-three, to April twenty-four, nineteen sixty-three,
|
96 |
+
LJ033-0046|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so.|went out to the garage to paint some children's blocks, and worked in the garage for half an hour or so.
|
97 |
+
LJ033-0072|I then stepped off of it and the officer picked it up in the middle and it bent so.|I then stepped off of it and the officer picked it up in the middle and it bent so.
|
98 |
+
LJ033-0135|Location of Bag|Location of Bag
|
99 |
+
LJ034-0083|The significance of Givens' observation that Oswald was carrying his clipboard|The significance of Givens' observation that Oswald was carrying his clipboard
|
100 |
+
LJ034-0179|and, quote, seemed to be sitting a little forward, end quote,|and, quote, seemed to be sitting a little forward, end quote,
|
101 |
+
LJ035-0125|Victoria Adams, who worked on the fourth floor of the Depository Building,|Victoria Adams, who worked on the fourth floor of the Depository Building,
|
102 |
+
LJ035-0162|approximately 30 to 45 seconds after Oswald's lunchroom encounter with Baker and Truly.|approximately thirty to forty-five seconds after Oswald's lunchroom encounter with Baker and Truly.
|
103 |
+
LJ035-0189|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade,|Special Agent Forrest V. Sorrels of the Secret Service, who had been in the motorcade,
|
104 |
+
LJ035-0208|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor|Oswald's known actions in the building immediately after the assassination are consistent with his having been at the southeast corner window of the sixth floor
|
105 |
+
LJ036-0216|Tippit got out and started to walk around the front of the car|Tippit got out and started to walk around the front of the car
|
106 |
+
LJ037-0093|William Arthur Smith was about a block east of 10th and Patton when he heard shots.|William Arthur Smith was about a block east of tenth and Patton when he heard shots.
|
107 |
+
LJ037-0157|taken from Oswald.|taken from Oswald.
|
108 |
+
LJ037-0178|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting,|or one used Remington-Peters cartridge case, which may have been in the revolver before the shooting,
|
109 |
+
LJ037-0219|Oswald's Jacket|Oswald's Jacket
|
110 |
+
LJ037-0222|When Oswald was arrested, he did not have a jacket.|When Oswald was arrested, he did not have a jacket.
|
111 |
+
LJ038-0017|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb.|Attracted by the sound of the sirens, Mrs. Postal stepped out of the box office and walked to the curb.
|
112 |
+
LJ038-0052|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight.|testified regarding the arrest of Oswald, as did the various police officers who participated in the fight.
|
113 |
+
LJ038-0077|Statements of Oswald during Detention.|Statements of Oswald during Detention.
|
114 |
+
LJ038-0161|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm.|and he asked me did I know which way he was coming, and I told him, yes, he probably come down Main and turn on Houston and then back again on Elm.
|
115 |
+
LJ038-0212|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear.|which appeared to be the work of a man expecting to be killed, or imprisoned, or to disappear.
|
116 |
+
LJ039-0103|Oswald, like all Marine recruits, received training on the rifle range at distances up to 500 yards,|Oswald, like all Marine recruits, received training on the rifle range at distances up to five hundred yards,
|
117 |
+
LJ039-0149|established that they had been previously loaded and ejected from the assassination rifle,|established that they had been previously loaded and ejected from the assassination rifle,
|
118 |
+
LJ040-0107|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of 5 and 7 years,|but apparently was not able to spend as much time with them as he would have liked, because of the age gaps of five and seven years,
|
119 |
+
LJ040-0119|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave.|When Pic returned home, Mrs. Oswald tried to play down the event but Mrs. Pic took a different view and asked the Oswalds to leave.
|
120 |
+
LJ040-0161|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic.|Dr. Hartogs recommended that Oswald be placed on probation on condition that he seek help and guidance through a child guidance clinic.
|
121 |
+
LJ040-0169|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone|She observed that since Lee's mother worked all day, he made his own meals and spent all his time alone
|
122 |
+
LJ041-0098|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote.|All the Marine Corps did was to teach you to kill and after you got out of the Marines you might be good gangsters, end quote.
|
123 |
+
LJ042-0017|and see for himself how a revolutionary society operates, a Marxist society.|and see for himself how a revolutionary society operates, a Marxist society.
|
124 |
+
LJ042-0070|Oswald was discovered in time to thwart his attempt at suicide.|Oswald was discovered in time to thwart his attempt at suicide.
|
125 |
+
LJ042-0161|Immediately after serving out his 3 years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR.|Immediately after serving out his three years in the U.S. Marine Corps, he abandoned his American life to seek a new life in the USSR.
|
126 |
+
LJ043-0147|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle.|He had left a note for his wife telling her what to do in case he were apprehended, as well as his notebook and the pictures of himself holding the rifle.
|
127 |
+
LJ043-0178|as, in fact, one of them did appear after the assassination.|as, in fact, one of them did appear after the assassination.
|
128 |
+
LJ043-0183|Oswald did not lack the determination and other traits required|Oswald did not lack the determination and other traits required
|
129 |
+
LJ043-0185|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker.|Some idea of what he thought was sufficient reason for such an act may be found in the nature of the motive that he stated for his attack on General Walker.
|
130 |
+
LJ044-0057|extensive investigation was not able to connect Oswald with that address, although it did develop the fact|extensive investigation was not able to connect Oswald with that address, although it did develop the fact
|
131 |
+
LJ044-0109|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities.|It is good to know that movements in support of fair play for Cuba has developed in New Orleans as well as in other cities.
|
132 |
+
LJ045-0081|Although she denied it in some of her testimony before the Commission,|Although she denied it in some of her testimony before the Commission,
|
133 |
+
LJ045-0147|She asked Oswald, quote,|She asked Oswald, quote,
|
134 |
+
LJ045-0204|he had never found anything to which he felt he could really belong.|he had never found anything to which he felt he could really belong.
|
135 |
+
LJ046-0193|and 12 to 15 of these cases as highly dangerous risks.|and twelve to fifteen of these cases as highly dangerous risks.
|
136 |
+
LJ046-0244|PRS should have investigated and been prepared to guard against it.|PRS should have investigated and been prepared to guard against it.
|
137 |
+
LJ047-0059|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries,|However, pursuant to a regular Bureau practice of interviewing certain immigrants from Iron Curtain countries,
|
138 |
+
LJ047-0142|The Bureau had no earlier information suggesting that Oswald had left the United States.|The Bureau had no earlier information suggesting that Oswald had left the United States.
|
139 |
+
LJ048-0035|It was against this background and consistent with the criteria followed by the FBI prior to November 22|It was against this background and consistent with the criteria followed by the FBI prior to November twenty-two
|
140 |
+
LJ048-0063|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination.|The formal FBI instructions to its agents outlining the information to be referred to the Secret Service were too narrow at the time of the assassination.
|
141 |
+
LJ048-0104|There were far safer routes via freeways directly to the Trade Mart,|There were far safer routes via freeways directly to the Trade Mart,
|
142 |
+
LJ048-0187|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators.|In addition, Secret Service agents riding in the motorcade were trained to scan buildings as part of their general observation of the crowd of spectators.
|
143 |
+
LJ048-0271|will be cause for removal from the Service, end quote.|will be cause for removal from the Service, end quote.
|
144 |
+
LJ049-0031|The Presidential vehicle in use in Dallas, described in chapter 2,|The Presidential vehicle in use in Dallas, described in chapter two,
|
145 |
+
LJ049-0059|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger,|Agents are instructed that it is not their responsibility to investigate or evaluate a present danger,
|
146 |
+
LJ049-0174|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated|to notify the Secret Service of the substantial information about Lee Harvey Oswald which the FBI had accumulated
|
147 |
+
LJ050-0049|and from a specialist in psychiatric prognostication at Walter Reed Hospital.|and from a specialist in psychiatric prognostication at Walter Reed Hospital.
|
148 |
+
LJ050-0113|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service,|Such agreements should describe in detail the information which is sought, the manner in which it will be provided to the Secret Service,
|
149 |
+
LJ050-0150|Its present manual filing system is obsolete;|Its present manual filing system is obsolete;
|
150 |
+
LJ050-0189|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described.|that written instructions might come into the hands of local newspapers, to the prejudice of the precautions described.
|
hifigan/README.md
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis
|
2 |
+
|
3 |
+
### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae
|
4 |
+
|
5 |
+
In our [paper](https://arxiv.org/abs/2010.05646),
|
6 |
+
we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.<br/>
|
7 |
+
We provide our implementation and pretrained models as open source in this repository.
|
8 |
+
|
9 |
+
**Abstract :**
|
10 |
+
Several recent works on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms.
|
11 |
+
Although such methods improve the sampling efficiency and memory usage,
|
12 |
+
their sample quality has not yet reached that of autoregressive and flow-based generative models.
|
13 |
+
In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis.
|
14 |
+
As speech audio consists of sinusoidal signals with various periods,
|
15 |
+
we demonstrate that modeling the periodic patterns of audio is crucial for enhancing sample quality.
|
16 |
+
A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method
|
17 |
+
demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than
|
18 |
+
real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen
|
19 |
+
speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times
|
20 |
+
faster than real-time on CPU with comparable quality to an autoregressive counterpart.
|
21 |
+
|
22 |
+
Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples.
|
23 |
+
|
24 |
+
|
25 |
+
## Pre-requisites
|
26 |
+
1. Python >= 3.6
|
27 |
+
2. Clone this repository.
|
28 |
+
3. Install the Python requirements. Please refer to [requirements.txt](requirements.txt).
|
29 |
+
4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
|
30 |
+
And move all wav files to `LJSpeech-1.1/wavs`
|
31 |
+
|
32 |
+
|
33 |
+
## Training
|
34 |
+
```
|
35 |
+
python train.py --config config_v1.json
|
36 |
+
```
|
37 |
+
To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.<br>
|
38 |
+
Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.<br>
|
39 |
+
You can change the path by adding `--checkpoint_path` option.
|
40 |
+
|
41 |
+
Validation loss during training with V1 generator.<br>
|
42 |
+
![validation loss](./validation_loss.png)
|
43 |
+
|
44 |
+
## Pretrained Model
|
45 |
+
You can also use pretrained models we provide.<br/>
|
46 |
+
[Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)<br/>
|
47 |
+
Details of each folder are as follows:
|
48 |
+
|
49 |
+
|Folder Name|Generator|Dataset|Fine-Tuned|
|
50 |
+
|------|---|---|---|
|
51 |
+
|LJ_V1|V1|LJSpeech|No|
|
52 |
+
|LJ_V2|V2|LJSpeech|No|
|
53 |
+
|LJ_V3|V3|LJSpeech|No|
|
54 |
+
|LJ_FT_T2_V1|V1|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
|
55 |
+
|LJ_FT_T2_V2|V2|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
|
56 |
+
|LJ_FT_T2_V3|V3|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
|
57 |
+
|VCTK_V1|V1|VCTK|No|
|
58 |
+
|VCTK_V2|V2|VCTK|No|
|
59 |
+
|VCTK_V3|V3|VCTK|No|
|
60 |
+
|UNIVERSAL_V1|V1|Universal|No|
|
61 |
+
|
62 |
+
We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets.
|
63 |
+
|
64 |
+
## Fine-Tuning
|
65 |
+
1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.<br/>
|
66 |
+
The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.<br/>
|
67 |
+
Example:
|
68 |
+
```
|
69 |
+
Audio File : LJ001-0001.wav
|
70 |
+
Mel-Spectrogram File : LJ001-0001.npy
|
71 |
+
```
|
72 |
+
2. Create `ft_dataset` folder and copy the generated mel-spectrogram files into it.<br/>
|
73 |
+
3. Run the following command.
|
74 |
+
```
|
75 |
+
python train.py --fine_tuning True --config config_v1.json
|
76 |
+
```
|
77 |
+
For other command line options, please refer to the training section.
|
78 |
+
|
79 |
+
|
80 |
+
## Inference from wav file
|
81 |
+
1. Make `test_files` directory and copy wav files into the directory.
|
82 |
+
2. Run the following command.
|
83 |
+
```
|
84 |
+
python inference.py --checkpoint_file [generator checkpoint file path]
|
85 |
+
```
|
86 |
+
Generated wav files are saved in `generated_files` by default.<br>
|
87 |
+
You can change the path by adding `--output_dir` option.
|
88 |
+
|
89 |
+
|
90 |
+
## Inference for end-to-end speech synthesis
|
91 |
+
1. Make `test_mel_files` directory and copy generated mel-spectrogram files into the directory.<br>
|
92 |
+
You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2),
|
93 |
+
[Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth.
|
94 |
+
2. Run the following command.
|
95 |
+
```
|
96 |
+
python inference_e2e.py --checkpoint_file [generator checkpoint file path]
|
97 |
+
```
|
98 |
+
Generated wav files are saved in `generated_files_from_mel` by default.<br>
|
99 |
+
You can change the path by adding `--output_dir` option.
|
100 |
+
|
101 |
+
|
102 |
+
## Acknowledgements
|
103 |
+
We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips)
|
104 |
+
and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this.
|
105 |
+
|
hifigan/__init__.py
ADDED
File without changes
|
hifigan/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (131 Bytes). View file
|
|
hifigan/__pycache__/env.cpython-311.pyc
ADDED
Binary file (1.32 kB). View file
|
|
hifigan/__pycache__/env.cpython-37.pyc
ADDED
Binary file (749 Bytes). View file
|
|
hifigan/__pycache__/env.cpython-39.pyc
ADDED
Binary file (785 Bytes). View file
|
|
hifigan/__pycache__/meldataset.cpython-311.pyc
ADDED
Binary file (11.7 kB). View file
|
|
hifigan/__pycache__/meldataset.cpython-37.pyc
ADDED
Binary file (5.38 kB). View file
|
|
hifigan/__pycache__/meldataset.cpython-38.pyc
ADDED
Binary file (5.45 kB). View file
|
|
hifigan/__pycache__/meldataset.cpython-39.pyc
ADDED
Binary file (5.46 kB). View file
|
|
hifigan/__pycache__/models.cpython-311.pyc
ADDED
Binary file (19.1 kB). View file
|
|
hifigan/__pycache__/models.cpython-37.pyc
ADDED
Binary file (8.9 kB). View file
|
|
hifigan/__pycache__/models.cpython-39.pyc
ADDED
Binary file (8.7 kB). View file
|
|
hifigan/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (3.46 kB). View file
|
|
hifigan/__pycache__/utils.cpython-37.pyc
ADDED
Binary file (1.88 kB). View file
|
|
hifigan/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (1.99 kB). View file
|
|
hifigan/config.yaml
ADDED
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: conf/tuning/train_fastspeech2.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
dry_run: false
|
5 |
+
iterator_type: sequence
|
6 |
+
output_dir: exp/tts_train_fastspeech2_raw_char_None
|
7 |
+
ngpu: 1
|
8 |
+
seed: 0
|
9 |
+
num_workers: 1
|
10 |
+
num_att_plot: 3
|
11 |
+
dist_backend: nccl
|
12 |
+
dist_init_method: env://
|
13 |
+
dist_world_size: 3
|
14 |
+
dist_rank: 0
|
15 |
+
local_rank: 0
|
16 |
+
dist_master_addr: localhost
|
17 |
+
dist_master_port: 52297
|
18 |
+
dist_launcher: null
|
19 |
+
multiprocessing_distributed: true
|
20 |
+
unused_parameters: false
|
21 |
+
sharded_ddp: false
|
22 |
+
cudnn_enabled: true
|
23 |
+
cudnn_benchmark: false
|
24 |
+
cudnn_deterministic: true
|
25 |
+
collect_stats: false
|
26 |
+
write_collected_feats: false
|
27 |
+
max_epoch: 1000
|
28 |
+
patience: null
|
29 |
+
val_scheduler_criterion:
|
30 |
+
- valid
|
31 |
+
- loss
|
32 |
+
early_stopping_criterion:
|
33 |
+
- valid
|
34 |
+
- loss
|
35 |
+
- min
|
36 |
+
best_model_criterion:
|
37 |
+
- - valid
|
38 |
+
- loss
|
39 |
+
- min
|
40 |
+
- - train
|
41 |
+
- loss
|
42 |
+
- min
|
43 |
+
keep_nbest_models: 5
|
44 |
+
grad_clip: 1.0
|
45 |
+
grad_clip_type: 2.0
|
46 |
+
grad_noise: false
|
47 |
+
accum_grad: 8
|
48 |
+
no_forward_run: false
|
49 |
+
resume: true
|
50 |
+
train_dtype: float32
|
51 |
+
use_amp: false
|
52 |
+
log_interval: null
|
53 |
+
use_tensorboard: true
|
54 |
+
use_wandb: false
|
55 |
+
wandb_project: null
|
56 |
+
wandb_id: null
|
57 |
+
wandb_entity: null
|
58 |
+
wandb_name: null
|
59 |
+
wandb_model_log_interval: -1
|
60 |
+
detect_anomaly: false
|
61 |
+
pretrain_path: null
|
62 |
+
init_param: []
|
63 |
+
ignore_init_mismatch: false
|
64 |
+
freeze_param: []
|
65 |
+
num_iters_per_epoch: 800
|
66 |
+
batch_size: 20
|
67 |
+
valid_batch_size: null
|
68 |
+
batch_bins: 3000000
|
69 |
+
valid_batch_bins: null
|
70 |
+
train_shape_file:
|
71 |
+
- exp/tts_stats_raw_char_None/train/text_shape.char
|
72 |
+
- exp/tts_stats_raw_char_None/train/speech_shape
|
73 |
+
valid_shape_file:
|
74 |
+
- exp/tts_stats_raw_char_None/valid/text_shape.char
|
75 |
+
- exp/tts_stats_raw_char_None/valid/speech_shape
|
76 |
+
batch_type: numel
|
77 |
+
valid_batch_type: null
|
78 |
+
fold_length:
|
79 |
+
- 150
|
80 |
+
- 204800
|
81 |
+
sort_in_batch: descending
|
82 |
+
sort_batch: descending
|
83 |
+
multiple_iterator: false
|
84 |
+
chunk_length: 500
|
85 |
+
chunk_shift_ratio: 0.5
|
86 |
+
num_cache_chunks: 1024
|
87 |
+
train_data_path_and_name_and_type:
|
88 |
+
- - dump/raw/tr_no_dev/text
|
89 |
+
- text
|
90 |
+
- text
|
91 |
+
- - exp/tts_train_raw_char_None/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
|
92 |
+
- durations
|
93 |
+
- text_int
|
94 |
+
- - dump/raw/tr_no_dev/wav.scp
|
95 |
+
- speech
|
96 |
+
- sound
|
97 |
+
valid_data_path_and_name_and_type:
|
98 |
+
- - dump/raw/dev/text
|
99 |
+
- text
|
100 |
+
- text
|
101 |
+
- - exp/tts_train_raw_char_None/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
|
102 |
+
- durations
|
103 |
+
- text_int
|
104 |
+
- - dump/raw/dev/wav.scp
|
105 |
+
- speech
|
106 |
+
- sound
|
107 |
+
allow_variable_data_keys: false
|
108 |
+
max_cache_size: 0.0
|
109 |
+
max_cache_fd: 32
|
110 |
+
valid_max_cache_size: null
|
111 |
+
optim: adam
|
112 |
+
optim_conf:
|
113 |
+
lr: 1.0
|
114 |
+
scheduler: noamlr
|
115 |
+
scheduler_conf:
|
116 |
+
model_size: 384
|
117 |
+
warmup_steps: 4000
|
118 |
+
token_list:
|
119 |
+
- <blank>
|
120 |
+
- <unk>
|
121 |
+
- <space>
|
122 |
+
- a
|
123 |
+
- A
|
124 |
+
- E
|
125 |
+
- k
|
126 |
+
- r
|
127 |
+
- I
|
128 |
+
- n
|
129 |
+
- s
|
130 |
+
- h
|
131 |
+
- i
|
132 |
+
- q
|
133 |
+
- t
|
134 |
+
- m
|
135 |
+
- o
|
136 |
+
- l
|
137 |
+
- p
|
138 |
+
- u
|
139 |
+
- y
|
140 |
+
- b
|
141 |
+
- d
|
142 |
+
- w
|
143 |
+
- ऐ
|
144 |
+
- g
|
145 |
+
- j
|
146 |
+
- c
|
147 |
+
- ट
|
148 |
+
- थ
|
149 |
+
- श
|
150 |
+
- U
|
151 |
+
- B
|
152 |
+
- औ
|
153 |
+
- ख
|
154 |
+
- ड
|
155 |
+
- z
|
156 |
+
- ध
|
157 |
+
- D
|
158 |
+
- f
|
159 |
+
- C
|
160 |
+
- M
|
161 |
+
- ष
|
162 |
+
- ण
|
163 |
+
- ठ
|
164 |
+
- J
|
165 |
+
- घ
|
166 |
+
- ऑ
|
167 |
+
- P
|
168 |
+
- क
|
169 |
+
- R
|
170 |
+
- T
|
171 |
+
- K
|
172 |
+
- ढ
|
173 |
+
- G
|
174 |
+
- ञ
|
175 |
+
- H
|
176 |
+
- ङ
|
177 |
+
- Y
|
178 |
+
- ऍ
|
179 |
+
- र
|
180 |
+
- <sos/eos>
|
181 |
+
odim: null
|
182 |
+
model_conf: {}
|
183 |
+
use_preprocessor: true
|
184 |
+
token_type: char
|
185 |
+
bpemodel: null
|
186 |
+
non_linguistic_symbols: null
|
187 |
+
cleaner: null
|
188 |
+
g2p: g2p_en_no_space
|
189 |
+
feats_extract: fbank
|
190 |
+
feats_extract_conf:
|
191 |
+
n_fft: 1024
|
192 |
+
hop_length: 256
|
193 |
+
win_length: 1024
|
194 |
+
fs: 22050
|
195 |
+
fmin: 0
|
196 |
+
fmax: 8000
|
197 |
+
n_mels: 80
|
198 |
+
normalize: global_mvn
|
199 |
+
normalize_conf:
|
200 |
+
stats_file: /var/www/html/IITM_TTS/E2E_TTS_FS2/fastspeech2/models/Hindi_male/feats_stats.npz
|
201 |
+
tts: fastspeech2
|
202 |
+
tts_conf:
|
203 |
+
adim: 384
|
204 |
+
aheads: 2
|
205 |
+
elayers: 4
|
206 |
+
eunits: 1536
|
207 |
+
dlayers: 4
|
208 |
+
dunits: 1536
|
209 |
+
positionwise_layer_type: conv1d
|
210 |
+
positionwise_conv_kernel_size: 3
|
211 |
+
duration_predictor_layers: 2
|
212 |
+
duration_predictor_chans: 256
|
213 |
+
duration_predictor_kernel_size: 3
|
214 |
+
postnet_layers: 5
|
215 |
+
postnet_filts: 5
|
216 |
+
postnet_chans: 256
|
217 |
+
use_masking: true
|
218 |
+
use_scaled_pos_enc: true
|
219 |
+
encoder_normalize_before: true
|
220 |
+
decoder_normalize_before: true
|
221 |
+
reduction_factor: 1
|
222 |
+
init_type: xavier_uniform
|
223 |
+
init_enc_alpha: 1.0
|
224 |
+
init_dec_alpha: 1.0
|
225 |
+
transformer_enc_dropout_rate: 0.2
|
226 |
+
transformer_enc_positional_dropout_rate: 0.2
|
227 |
+
transformer_enc_attn_dropout_rate: 0.2
|
228 |
+
transformer_dec_dropout_rate: 0.2
|
229 |
+
transformer_dec_positional_dropout_rate: 0.2
|
230 |
+
transformer_dec_attn_dropout_rate: 0.2
|
231 |
+
pitch_predictor_layers: 5
|
232 |
+
pitch_predictor_chans: 256
|
233 |
+
pitch_predictor_kernel_size: 5
|
234 |
+
pitch_predictor_dropout: 0.5
|
235 |
+
pitch_embed_kernel_size: 1
|
236 |
+
pitch_embed_dropout: 0.0
|
237 |
+
stop_gradient_from_pitch_predictor: true
|
238 |
+
energy_predictor_layers: 2
|
239 |
+
energy_predictor_chans: 256
|
240 |
+
energy_predictor_kernel_size: 3
|
241 |
+
energy_predictor_dropout: 0.5
|
242 |
+
energy_embed_kernel_size: 1
|
243 |
+
energy_embed_dropout: 0.0
|
244 |
+
stop_gradient_from_energy_predictor: false
|
245 |
+
pitch_extract: dio
|
246 |
+
pitch_extract_conf:
|
247 |
+
fs: 22050
|
248 |
+
n_fft: 1024
|
249 |
+
hop_length: 256
|
250 |
+
f0max: 350
|
251 |
+
f0min: 40
|
252 |
+
reduction_factor: 1
|
253 |
+
pitch_normalize: global_mvn
|
254 |
+
pitch_normalize_conf:
|
255 |
+
stats_file: /var/www/html/IITM_TTS/E2E_TTS_FS2/fastspeech2/models/Hindi_male/pitch_stats.npz
|
256 |
+
energy_extract: energy
|
257 |
+
energy_extract_conf:
|
258 |
+
fs: 22050
|
259 |
+
n_fft: 1024
|
260 |
+
hop_length: 256
|
261 |
+
win_length: 1024
|
262 |
+
reduction_factor: 1
|
263 |
+
energy_normalize: global_mvn
|
264 |
+
energy_normalize_conf:
|
265 |
+
stats_file: /var/www/html/IITM_TTS/E2E_TTS_FS2/fastspeech2/models/Hindi_male/energy_stats.npz
|
266 |
+
required:
|
267 |
+
- output_dir
|
268 |
+
- token_list
|
269 |
+
version: 0.10.3a3
|
270 |
+
distributed: true
|
hifigan/config_v1.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"resblock": "1",
|
3 |
+
"num_gpus": 0,
|
4 |
+
"batch_size": 16,
|
5 |
+
"learning_rate": 0.0002,
|
6 |
+
"adam_b1": 0.8,
|
7 |
+
"adam_b2": 0.99,
|
8 |
+
"lr_decay": 0.999,
|
9 |
+
"seed": 1234,
|
10 |
+
|
11 |
+
"upsample_rates": [8,8,2,2],
|
12 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
13 |
+
"upsample_initial_channel": 512,
|
14 |
+
"resblock_kernel_sizes": [3,7,11],
|
15 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
16 |
+
|
17 |
+
"segment_size": 8192,
|
18 |
+
"num_mels": 80,
|
19 |
+
"num_freq": 1025,
|
20 |
+
"n_fft": 1024,
|
21 |
+
"hop_size": 256,
|
22 |
+
"win_size": 1024,
|
23 |
+
|
24 |
+
"sampling_rate": 22050,
|
25 |
+
|
26 |
+
"fmin": 0,
|
27 |
+
"fmax": 8000,
|
28 |
+
"fmax_for_loss": null,
|
29 |
+
|
30 |
+
"num_workers": 4,
|
31 |
+
|
32 |
+
"dist_config": {
|
33 |
+
"dist_backend": "nccl",
|
34 |
+
"dist_url": "tcp://localhost:54321",
|
35 |
+
"world_size": 1
|
36 |
+
}
|
37 |
+
}
|
hifigan/config_v2.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"resblock": "1",
|
3 |
+
"num_gpus": 0,
|
4 |
+
"batch_size": 16,
|
5 |
+
"learning_rate": 0.0002,
|
6 |
+
"adam_b1": 0.8,
|
7 |
+
"adam_b2": 0.99,
|
8 |
+
"lr_decay": 0.999,
|
9 |
+
"seed": 1234,
|
10 |
+
|
11 |
+
"upsample_rates": [8,8,2,2],
|
12 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
13 |
+
"upsample_initial_channel": 128,
|
14 |
+
"resblock_kernel_sizes": [3,7,11],
|
15 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
16 |
+
|
17 |
+
"segment_size": 8192,
|
18 |
+
"num_mels": 80,
|
19 |
+
"num_freq": 1025,
|
20 |
+
"n_fft": 1024,
|
21 |
+
"hop_size": 256,
|
22 |
+
"win_size": 1024,
|
23 |
+
|
24 |
+
"sampling_rate": 22050,
|
25 |
+
|
26 |
+
"fmin": 0,
|
27 |
+
"fmax": 8000,
|
28 |
+
"fmax_for_loss": null,
|
29 |
+
|
30 |
+
"num_workers": 4,
|
31 |
+
|
32 |
+
"dist_config": {
|
33 |
+
"dist_backend": "nccl",
|
34 |
+
"dist_url": "tcp://localhost:54321",
|
35 |
+
"world_size": 1
|
36 |
+
}
|
37 |
+
}
|
hifigan/config_v3.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"resblock": "2",
|
3 |
+
"num_gpus": 0,
|
4 |
+
"batch_size": 16,
|
5 |
+
"learning_rate": 0.0002,
|
6 |
+
"adam_b1": 0.8,
|
7 |
+
"adam_b2": 0.99,
|
8 |
+
"lr_decay": 0.999,
|
9 |
+
"seed": 1234,
|
10 |
+
|
11 |
+
"upsample_rates": [8,8,4],
|
12 |
+
"upsample_kernel_sizes": [16,16,8],
|
13 |
+
"upsample_initial_channel": 256,
|
14 |
+
"resblock_kernel_sizes": [3,5,7],
|
15 |
+
"resblock_dilation_sizes": [[1,2], [2,6], [3,12]],
|
16 |
+
|
17 |
+
"segment_size": 8192,
|
18 |
+
"num_mels": 80,
|
19 |
+
"num_freq": 1025,
|
20 |
+
"n_fft": 1024,
|
21 |
+
"hop_size": 256,
|
22 |
+
"win_size": 1024,
|
23 |
+
|
24 |
+
"sampling_rate": 22050,
|
25 |
+
|
26 |
+
"fmin": 0,
|
27 |
+
"fmax": 8000,
|
28 |
+
"fmax_for_loss": null,
|
29 |
+
|
30 |
+
"num_workers": 4,
|
31 |
+
|
32 |
+
"dist_config": {
|
33 |
+
"dist_backend": "nccl",
|
34 |
+
"dist_url": "tcp://localhost:54321",
|
35 |
+
"world_size": 1
|
36 |
+
}
|
37 |
+
}
|
hifigan/denorm/test_243.npy.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1eb0656828dbbba211b8646a55909806cf622c6c0c4969abc12433b49fe674cb
|
3 |
+
size 70730
|
hifigan/env.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
|
4 |
+
|
5 |
+
class AttrDict(dict):
    """Dictionary whose entries can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the dict itself as the instance namespace so that ``obj.key``
        # and ``obj["key"]`` always refer to the same storage.
        self.__dict__ = self
|
9 |
+
|
10 |
+
|
11 |
+
def build_env(config, config_name, path):
    """Copy the configuration file into the checkpoint directory.

    Args:
        config: Path of the configuration file to copy.
        config_name: File name to give the copy inside ``path``.
        path: Destination directory; created if it does not exist.

    Nothing is done when ``config`` already is the destination file.
    """
    t_path = os.path.join(path, config_name)
    # Compare resolved paths rather than raw strings: different spellings
    # (e.g. "cp/config.json" vs "cp/./config.json") can name the same file,
    # and shutil.copyfile raises SameFileError when source and target coincide.
    if os.path.realpath(config) == os.path.realpath(t_path):
        return
    os.makedirs(path, exist_ok=True)
    shutil.copyfile(config, t_path)
|
hifigan/fs2_speed.txt
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
loading model in cpu
|
2 |
+
Run 0
|
3 |
+
Elapsed time: 0.38950467109680176
|
4 |
+
Run 1
|
5 |
+
Elapsed time: 0.1787424087524414
|
6 |
+
Run 2
|
7 |
+
Elapsed time: 0.18103241920471191
|
8 |
+
Run 3
|
9 |
+
Elapsed time: 0.18195390701293945
|
10 |
+
Run 4
|
11 |
+
Elapsed time: 0.18042469024658203
|
12 |
+
-----------------------------
|
13 |
+
loading model in cuda
|
14 |
+
Run 0
|
15 |
+
Elapsed time: 84.3974118232727
|
16 |
+
Run 1
|
17 |
+
Elapsed time: 0.12549662590026855
|
18 |
+
Run 2
|
19 |
+
Elapsed time: 0.12475895881652832
|
20 |
+
Run 3
|
21 |
+
Elapsed time: 0.12504363059997559
|
22 |
+
Run 4
|
23 |
+
Elapsed time: 0.12546324729919434
|
24 |
+
-----------------------------
|
hifigan/gen.wav
ADDED
Binary file (262 kB). View file
|
|
hifigan/griffin.wav
ADDED
Binary file (176 kB). View file
|
|
hifigan/hifigan_speed.txt
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
loading model in cpu
|
2 |
+
Removing weight norm...
|
3 |
+
Run 0
|
4 |
+
Elapsed time: 0.453446626663208
|
5 |
+
Elapsed time: 2.7247982025146484
|
6 |
+
Elapsed time: 5.03496241569519
|
7 |
+
Run 1
|
8 |
+
Elapsed time: 0.5230855941772461
|
9 |
+
Elapsed time: 2.5505268573760986
|
10 |
+
Elapsed time: 4.904325246810913
|
11 |
+
Run 2
|
12 |
+
Elapsed time: 0.5279533863067627
|
13 |
+
Elapsed time: 2.5415592193603516
|
14 |
+
Elapsed time: 4.775323390960693
|
15 |
+
loading model in cuda
|
16 |
+
Removing weight norm...
|
17 |
+
Run 0
|
18 |
+
Elapsed time: 116.25620722770691
|
19 |
+
Elapsed time: 0.08193731307983398
|
20 |
+
Elapsed time: 0.15532135963439941
|
21 |
+
Run 1
|
22 |
+
Elapsed time: 0.020008563995361328
|
23 |
+
Elapsed time: 0.07747459411621094
|
24 |
+
Elapsed time: 0.1503896713256836
|
25 |
+
Run 2
|
26 |
+
Elapsed time: 0.019192934036254883
|
27 |
+
Elapsed time: 0.07719159126281738
|
28 |
+
Elapsed time: 0.15003252029418945
|
hifigan/inference.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import absolute_import, division, print_function, unicode_literals
|
2 |
+
|
3 |
+
import glob
|
4 |
+
import os
|
5 |
+
import argparse
|
6 |
+
import json
|
7 |
+
import torch
|
8 |
+
from scipy.io.wavfile import write
|
9 |
+
from env import AttrDict
|
10 |
+
from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
|
11 |
+
from models import Generator
|
12 |
+
|
13 |
+
h = None
|
14 |
+
device = None
|
15 |
+
|
16 |
+
|
17 |
+
def load_checkpoint(filepath, device):
    """Read a saved checkpoint from disk and return its contents.

    Args:
        filepath: Path to the checkpoint; must be an existing file.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        The object that ``torch.load`` deserialized from ``filepath``.
    """
    assert os.path.isfile(filepath)
    message = "Loading '{}'".format(filepath)
    print(message)
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # that come from a trusted source.
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded
|
23 |
+
|
24 |
+
|
25 |
+
def get_mel(x):
    """Call ``mel_spectrogram`` on ``x`` with the settings stored in ``h``.

    ``h`` is the module-level configuration object; it is ``None`` until
    ``main`` assigns it, so this must only be called afterwards.
    """
    return mel_spectrogram(
        x,
        h.n_fft,
        h.num_mels,
        h.sampling_rate,
        h.hop_size,
        h.win_size,
        h.fmin,
        h.fmax,
    )
|
27 |
+
|
28 |
+
|
29 |
+
def scan_checkpoint(cp_dir, prefix):
    """Find the checkpoint in ``cp_dir`` whose path sorts last.

    Args:
        cp_dir: Directory to search.
        prefix: Leading part of the checkpoint file names to match.

    Returns:
        The lexicographically greatest path matching ``prefix*`` inside
        ``cp_dir``, or an empty string when nothing matches.
    """
    matches = glob.glob(os.path.join(cp_dir, prefix + '*'))
    # Plain string ordering is used, so numbered checkpoints only rank by
    # step when their numbers are zero-padded to the same width.
    return max(matches) if matches else ''
|
35 |
+
|
36 |
+
|
37 |
+
def inference(a):
    """Re-synthesize every file in ``a.input_wavs_dir`` with the generator.

    Each input is loaded with ``load_wav``, scaled by ``MAX_WAV_VALUE``,
    converted with ``get_mel``, passed through the generator, and written to
    ``a.output_dir`` as ``<name>_generated.wav``.

    Args:
        a: Parsed command-line arguments providing ``checkpoint_file``,
            ``input_wavs_dir`` and ``output_dir``.

    Relies on the module-level ``h`` and ``device`` having been set.
    """
    generator = Generator(h).to(device)

    state_dict_g = load_checkpoint(a.checkpoint_file, device)
    generator.load_state_dict(state_dict_g['generator'])

    # os.listdir returns entries in arbitrary order; sort for a reproducible run.
    filelist = sorted(os.listdir(a.input_wavs_dir))

    os.makedirs(a.output_dir, exist_ok=True)

    generator.eval()
    generator.remove_weight_norm()
    with torch.no_grad():
        for filename in filelist:
            wav, _ = load_wav(os.path.join(a.input_wavs_dir, filename))
            wav = wav / MAX_WAV_VALUE
            wav = torch.FloatTensor(wav).to(device)
            x = get_mel(wav.unsqueeze(0))
            y_g_hat = generator(x)
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            # Samples outside the int16 range would wrap around when cast;
            # saturate them instead so loud peaks do not flip sign.
            audio = torch.clamp(audio, -32768, 32767)
            audio = audio.cpu().numpy().astype('int16')

            output_file = os.path.join(a.output_dir, os.path.splitext(filename)[0] + '_generated.wav')
            write(output_file, h.sampling_rate, audio)
            print(output_file)
|
63 |
+
|
64 |
+
|
65 |
+
def main():
    """Command-line entry point: parse arguments, load the config, run inference.

    The configuration is read from ``config.json`` in the directory that
    holds the checkpoint file and stored in the module-level ``h``. The
    module-level ``device`` is set to CUDA when available, otherwise CPU.
    """
    global h, device

    print('Initializing Inference Process..')

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_wavs_dir', default='test_files')
    parser.add_argument('--output_dir', default='generated_files')
    parser.add_argument('--checkpoint_file', required=True)
    a = parser.parse_args()

    checkpoint_dir = os.path.split(a.checkpoint_file)[0]
    with open(os.path.join(checkpoint_dir, 'config.json')) as f:
        raw_config = f.read()
    h = AttrDict(json.loads(raw_config))

    torch.manual_seed(h.seed)
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(h.seed)
    device = torch.device('cuda' if use_cuda else 'cpu')

    inference(a)
|
91 |
+
|
92 |
+
|
93 |
+
if __name__ == '__main__':
|
94 |
+
main()
|
95 |
+
|
hifigan/inference_e2e.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import absolute_import, division, print_function, unicode_literals
|
2 |
+
|
3 |
+
import glob
|
4 |
+
import os
|
5 |
+
import numpy as np
|
6 |
+
import argparse
|
7 |
+
import json
|
8 |
+
import torch
|
9 |
+
from scipy.io.wavfile import write
|
10 |
+
from env import AttrDict
|
11 |
+
from meldataset import MAX_WAV_VALUE
|
12 |
+
from models import Generator
|
13 |
+
|
14 |
+
h = None
|
15 |
+
device = None
|
16 |
+
|
17 |
+
|
18 |
+
def load_checkpoint(filepath, device):
    """Load a checkpoint file and hand back whatever it contains.

    Args:
        filepath: Location of the checkpoint; asserted to be an existing file.
        device: Forwarded to ``torch.load`` as ``map_location``.

    Returns:
        The deserialized checkpoint object.
    """
    assert os.path.isfile(filepath)
    banner = "Loading '{}'".format(filepath)
    print(banner)
    # NOTE(review): torch.load relies on pickle; use trusted checkpoints only.
    contents = torch.load(filepath, map_location=device)
    print("Complete.")
    return contents
|
24 |
+
|
25 |
+
|
26 |
+
def scan_checkpoint(cp_dir, prefix):
    """Return the last checkpoint path (in string order) under ``cp_dir``.

    Args:
        cp_dir: Directory that holds the checkpoints.
        prefix: Start of the file names to consider.

    Returns:
        The greatest matching path under plain string comparison, or ``''``
        if no file name in ``cp_dir`` starts with ``prefix``.
    """
    candidates = glob.glob(os.path.join(cp_dir, prefix + '*'))
    if not candidates:
        return ''
    # Comparison is lexicographic, not numeric.
    return max(candidates)
|
32 |
+
|
33 |
+
|
34 |
+
def inference(a):
    """Generate a wav file for every saved array in ``a.input_mels_dir``.

    Each file is read with ``np.load``, converted to a float tensor, passed
    through the generator, and written to ``a.output_dir`` as
    ``<name>_generated_e2e.wav``.

    Args:
        a: Parsed command-line arguments providing ``checkpoint_file``,
            ``input_mels_dir`` and ``output_dir``.

    Relies on the module-level ``h`` and ``device`` having been set.
    """
    generator = Generator(h).to(device)

    state_dict_g = load_checkpoint(a.checkpoint_file, device)
    generator.load_state_dict(state_dict_g['generator'])

    # os.listdir returns entries in arbitrary order; sort for a reproducible run.
    filelist = sorted(os.listdir(a.input_mels_dir))

    os.makedirs(a.output_dir, exist_ok=True)

    generator.eval()
    generator.remove_weight_norm()
    with torch.no_grad():
        for filename in filelist:
            x = np.load(os.path.join(a.input_mels_dir, filename))
            x = torch.FloatTensor(x).to(device)
            y_g_hat = generator(x)
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            # Samples outside the int16 range would wrap around when cast;
            # saturate them instead so loud peaks do not flip sign.
            audio = torch.clamp(audio, -32768, 32767)
            audio = audio.cpu().numpy().astype('int16')

            output_file = os.path.join(a.output_dir, os.path.splitext(filename)[0] + '_generated_e2e.wav')
            write(output_file, h.sampling_rate, audio)
            print(output_file)
|
58 |
+
|
59 |
+
|
60 |
+
def main():
    """Command-line entry point for mel-to-wav inference.

    Parses the arguments, loads ``config.json`` from the checkpoint's
    directory into the module-level ``h``, selects the module-level
    ``device`` (CUDA when available, otherwise CPU) and calls ``inference``.
    """
    global h, device

    print('Initializing Inference Process..')

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_mels_dir', default='test_mel_files')
    parser.add_argument('--output_dir', default='generated_files_from_mel')
    parser.add_argument('--checkpoint_file', required=True)
    a = parser.parse_args()

    checkpoint_dir = os.path.split(a.checkpoint_file)[0]
    with open(os.path.join(checkpoint_dir, 'config.json')) as f:
        raw_config = f.read()
    h = AttrDict(json.loads(raw_config))

    torch.manual_seed(h.seed)
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(h.seed)
    device = torch.device('cuda' if use_cuda else 'cpu')

    inference(a)
|
86 |
+
|
87 |
+
|
88 |
+
if __name__ == '__main__':
|
89 |
+
main()
|
90 |
+
|
hifigan/inference_from_espnet.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import absolute_import, division, print_function, unicode_literals
|
2 |
+
import glob
|
3 |
+
import os
|
4 |
+
import argparse
|
5 |
+
import json
|
6 |
+
import torch
|
7 |
+
import numpy as np
|
8 |
+
from scipy.io.wavfile import write
|
9 |
+
from env import AttrDict
|
10 |
+
from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
|
11 |
+
from models import Generator
|
12 |
+
import time
|
13 |
+
|
14 |
+
h = None
|
15 |
+
device = "cpu"
|
16 |
+
|
17 |
+
|
18 |
+
def load_checkpoint(filepath, device):
    """Deserialize the checkpoint stored at ``filepath``.

    Args:
        filepath: Checkpoint path; the function asserts that it is a file.
        device: Given to ``torch.load`` as ``map_location``.

    Returns:
        The object produced by ``torch.load``.
    """
    assert os.path.isfile(filepath)
    notice = "Loading '{}'".format(filepath)
    print(notice)
    # NOTE(review): torch.load relies on pickle; use trusted checkpoints only.
    restored = torch.load(filepath, map_location=device)
    print("Complete.")
    return restored
|
24 |
+
|
25 |
+
|
26 |
+
def get_mel(x):
    """Call ``mel_spectrogram`` on ``x`` with the settings stored in ``h``.

    ``h`` is a module-level global initialised to ``None``; it must have been
    assigned a configuration object before this function is called.
    """
    return mel_spectrogram(
        x,
        h.n_fft,
        h.num_mels,
        h.sampling_rate,
        h.hop_size,
        h.win_size,
        h.fmin,
        h.fmax,
    )
|
28 |
+
|
29 |
+
|
30 |
+
def scan_checkpoint(cp_dir, prefix):
    """Pick the checkpoint path in ``cp_dir`` that comes last in string order.

    Args:
        cp_dir: Directory to look in.
        prefix: Prefix that candidate file names must start with.

    Returns:
        The greatest path matching ``prefix*`` under string comparison, or an
        empty string when there is no match.
    """
    found = glob.glob(os.path.join(cp_dir, prefix + '*'))
    # String ordering, so step numbers compare correctly only when padded.
    return max(found) if found else ''
|
36 |
+
|
37 |
+
|
38 |
+
def inference(a):
    """Vocode every saved feature file in ``a.input_wavs_dir`` into a WAV file.

    Each file is read with ``torch.load``, converted to a 2-D array, transposed
    if its first axis is not 80, run through the HiFi-GAN generator and written
    to ``a.output_dir`` as ``<stem>_generated.wav`` at ``h.sampling_rate``.

    Args:
        a: Parsed command-line namespace providing ``checkpoint_file``,
            ``input_wavs_dir`` and ``output_dir``.
    """
    generator = Generator(h).to(device)

    state_dict_g = load_checkpoint(a.checkpoint_file, device)
    generator.load_state_dict(state_dict_g['generator'])

    filelist = os.listdir(a.input_wavs_dir)

    os.makedirs(a.output_dir, exist_ok=True)

    generator.eval()
    generator.remove_weight_norm()
    with torch.no_grad():
        for filname in filelist:
            print(filname)
            # map_location="cpu" lets features that were saved from a CUDA
            # process be loaded (and converted to NumPy) on a CPU-only host.
            arr2 = torch.load(os.path.join(a.input_wavs_dir, filname), map_location="cpu")
            print("arr2 type", type(arr2))
            if torch.is_tensor(arr2):
                # NumPy conversion is rejected for tensors that require grad.
                arr2 = arr2.detach()
            arr = np.array(arr2).astype(float)
            print("arr type", type(arr))
            # 80 is the channel count the Generator's first convolution is
            # built with; features stored as (frames, 80) are transposed.
            if arr.shape[0] != 80:
                arr = arr.T
            print(arr.shape)
            # Add the leading batch axis expected by the generator.
            arr_new2 = arr.reshape(1, arr.shape[0], arr.shape[1])
            x_new = torch.FloatTensor(arr_new2).to(device)
            print("x_new", x_new.shape)

            st = time.time()
            y_g_hat = generator(x_new)
            et = time.time()
            print("Time taken by generator:", (et-st))
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            # The generator ends in tanh, so a saturated sample equals
            # +MAX_WAV_VALUE, which does not fit in int16; clamp before casting
            # so it cannot wrap around to a large negative value.
            audio = torch.clamp(audio, -MAX_WAV_VALUE, MAX_WAV_VALUE - 1)
            audio = audio.cpu().numpy().astype('int16')

            output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated.wav')
            write(output_file, h.sampling_rate, audio)
            print(output_file)
|
91 |
+
|
92 |
+
|
93 |
+
def main():
    """Parse CLI arguments, load the vocoder config, pick a device and run inference.

    The JSON config is read from ``config.json`` next to ``--checkpoint_file``
    and stored in the module-level ``h``.
    """
    global h, device

    print('Initializing Inference Process..')

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_wavs_dir', default='denorm')
    parser.add_argument('--output_dir', default='wav_folder')
    parser.add_argument('--checkpoint_file', required=True)
    a = parser.parse_args()

    checkpoint_dir = os.path.dirname(a.checkpoint_file)
    with open(os.path.join(checkpoint_dir, 'config.json')) as f:
        json_config = json.load(f)
    h = AttrDict(json_config)

    torch.manual_seed(h.seed)

    # NOTE(review): the module initialises ``device`` to "cpu", so this
    # condition is only true if something resets ``device`` to None before
    # main() runs; otherwise inference always runs on the CPU.
    use_cuda = device is None and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(h.seed)
    device = torch.device('cuda' if use_cuda else 'cpu')

    print("device", device)
    inference(a)
|
120 |
+
|
121 |
+
|
122 |
+
if __name__ == '__main__':
|
123 |
+
main()
|
124 |
+
|
hifigan/meldataset.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import torch
|
5 |
+
import torch.utils.data
|
6 |
+
import numpy as np
|
7 |
+
from librosa.util import normalize
|
8 |
+
from scipy.io.wavfile import read
|
9 |
+
from librosa.filters import mel as librosa_mel_fn
|
10 |
+
|
11 |
+
MAX_WAV_VALUE = 32768.0
|
12 |
+
|
13 |
+
|
14 |
+
def load_wav(full_path):
    """Read a WAV file and return ``(data, sampling_rate)``.

    Note the order is the reverse of what ``scipy.io.wavfile.read`` returns.
    """
    rate_then_samples = read(full_path)
    return rate_then_samples[1], rate_then_samples[0]
|
17 |
+
|
18 |
+
|
19 |
+
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress ``x`` (NumPy): floor it at ``clip_val``, scale by ``C``, take the natural log."""
    floored = np.clip(x, a_min=clip_val, a_max=None)
    scaled = floored * C
    return np.log(scaled)
|
21 |
+
|
22 |
+
|
23 |
+
def dynamic_range_decompression(x, C=1):
    """Invert the log compression (NumPy): exponentiate ``x`` and divide by ``C``."""
    expanded = np.exp(x)
    return expanded / C
|
25 |
+
|
26 |
+
|
27 |
+
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress tensor ``x``: clamp it to at least ``clip_val``, scale by ``C``, take the natural log."""
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)
|
29 |
+
|
30 |
+
|
31 |
+
def dynamic_range_decompression_torch(x, C=1):
    """Invert the log compression for tensors: exponentiate ``x`` and divide by ``C``."""
    expanded = torch.exp(x)
    return expanded / C
|
33 |
+
|
34 |
+
|
35 |
+
def spectral_normalize_torch(magnitudes):
    """Apply ``dynamic_range_compression_torch`` with its default settings to ``magnitudes``."""
    return dynamic_range_compression_torch(magnitudes)
|
38 |
+
|
39 |
+
|
40 |
+
def spectral_de_normalize_torch(magnitudes):
    """Apply ``dynamic_range_decompression_torch`` with its default settings to ``magnitudes``."""
    return dynamic_range_decompression_torch(magnitudes)
|
43 |
+
|
44 |
+
|
45 |
+
mel_basis = {}
|
46 |
+
hann_window = {}
|
47 |
+
|
48 |
+
|
49 |
+
def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram of waveform tensor ``y``.

    The mel filterbank and the Hann window are cached in the module-level
    ``mel_basis`` / ``hann_window`` dictionaries so they are built only once
    per distinct parameter set and device.

    Args:
        y: Waveform tensor; a channel axis is inserted at dim 1 for padding and
            removed again, so a (batch, samples) layout is presumably expected
            -- TODO confirm against callers.
        n_fft: FFT size.
        num_mels: Number of mel bands.
        sampling_rate: Sampling rate used to build the mel filterbank.
        hop_size: STFT hop length.
        win_size: STFT window length.
        fmin: Lowest filterbank frequency.
        fmax: Highest filterbank frequency.
        center: Forwarded to ``torch.stft``.

    Returns:
        The mel spectrogram after ``spectral_normalize_torch``.
    """
    # Values outside [-1, 1] are only reported, not rejected.
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    device_key = str(y.device)

    # The lookup key must be the same key that is stored, and it must cover
    # every parameter that shapes the filterbank; otherwise the cache either
    # never hits or returns a basis built for different settings.
    basis_key = (sampling_rate, n_fft, num_mels, fmin, fmax, device_key)
    if basis_key not in mel_basis:
        # Keyword arguments work on both old and new librosa releases.
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[basis_key] = torch.from_numpy(mel).float().to(y.device)

    window_key = (win_size, device_key)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).to(y.device)

    # Reflect-pad manually so the framing matches center=False.
    pad = int((n_fft-hop_size)/2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # NOTE(review): newer PyTorch releases require an explicit
    # ``return_complex`` argument here; the call is kept in the form valid for
    # the torch version pinned in requirements.txt -- confirm target version.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[window_key],
                      center=center, pad_mode='reflect', normalized=False, onesided=True)

    # Magnitude from the trailing (real, imag) axis; epsilon keeps sqrt stable.
    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))

    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
|
73 |
+
|
74 |
+
|
75 |
+
def get_dataset_filelist(a):
    """Build the training and validation WAV path lists.

    Each non-empty line of ``a.input_training_file`` / ``a.input_validation_file``
    contributes ``<a.input_wavs_dir>/<text before the first '|'>.wav``.

    Returns:
        Tuple ``(training_files, validation_files)`` of path lists.
    """
    def _wav_paths(list_file):
        # One entry per non-empty line; only the field before '|' is used.
        with open(list_file, 'r', encoding='utf-8') as fi:
            entries = fi.read().split('\n')
        paths = []
        for entry in entries:
            if len(entry) > 0:
                paths.append(os.path.join(a.input_wavs_dir, entry.split('|')[0] + '.wav'))
        return paths

    training_files = _wav_paths(a.input_training_file)
    validation_files = _wav_paths(a.input_validation_file)
    return training_files, validation_files
|
84 |
+
|
85 |
+
|
86 |
+
class MelDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(mel, audio, filename, mel_loss)`` items for vocoder training.

    In normal mode the mel input is computed from the (optionally cropped)
    audio. In fine-tuning mode it is loaded from ``<base_mels_path>/<stem>.npy``
    and cropped together with the matching audio span.
    """

    def __init__(self, training_files, segment_size, n_fft, num_mels,
                 hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
                 device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None):
        """Store the configuration and optionally shuffle the file list.

        Note: ``training_files`` is shuffled in place, and the global
        ``random`` generator is re-seeded with 1234.
        """
        self.audio_files = training_files
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audio_files)
        self.segment_size = segment_size
        self.sampling_rate = sampling_rate
        self.split = split
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_size = hop_size
        self.win_size = win_size
        self.fmin = fmin
        self.fmax = fmax
        self.fmax_loss = fmax_loss
        self.cached_wav = None
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device
        self.fine_tuning = fine_tuning
        self.base_mels_path = base_mels_path

    def __getitem__(self, index):
        """Return ``(mel, audio, filename, mel_loss)`` for ``self.audio_files[index]``.

        Raises:
            ValueError: If the file's sampling rate differs from ``self.sampling_rate``.
        """
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            if not self.fine_tuning:
                audio = normalize(audio) * 0.95
            self.cached_wav = audio
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            self._cache_ref_count = self.n_cache_reuse
        else:
            # Reuse the most recently loaded waveform while the counter lasts.
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        if not self.fine_tuning:
            if self.split:
                if audio.size(1) >= self.segment_size:
                    # Random crop of exactly segment_size samples.
                    max_audio_start = audio.size(1) - self.segment_size
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[:, audio_start:audio_start+self.segment_size]
                else:
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

            mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                  self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
                                  center=False)
        else:
            mel = np.load(
                os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy'))
            mel = torch.from_numpy(mel)

            if len(mel.shape) < 3:
                mel = mel.unsqueeze(0)

            if self.split:
                frames_per_seg = math.ceil(self.segment_size / self.hop_size)

                if audio.size(1) >= self.segment_size:
                    # Clamp at 0: when the mel has no more frames than one
                    # segment the unclamped bound is negative and
                    # random.randint would raise ValueError.
                    max_mel_start = max(0, mel.size(2) - frames_per_seg - 1)
                    mel_start = random.randint(0, max_mel_start)
                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
                    audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
                else:
                    mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

        # Second mel, computed with fmax_loss instead of fmax.
        mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                   self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
                                   center=False)

        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())

    def __len__(self):
        """Return the number of audio files."""
        return len(self.audio_files)
|
hifigan/models.py
ADDED
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
import torch.nn as nn
|
4 |
+
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
5 |
+
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
6 |
+
from utils import init_weights, get_padding
|
7 |
+
|
8 |
+
LRELU_SLOPE = 0.1
|
9 |
+
|
10 |
+
|
11 |
+
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv -> undilated conv) pairs.

    All convolutions keep the channel count and are wrapped in weight norm.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.h = h

        def _conv(rate):
            # Channel-preserving, weight-normalised conv at the given dilation.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                                      padding=get_padding(kernel_size, rate)))

        # First conv of each pair uses the configured dilation rates.
        self.convs1 = nn.ModuleList([_conv(dilation[0]), _conv(dilation[1]), _conv(dilation[2])])
        self.convs1.apply(init_weights)

        # Second conv of each pair is always undilated.
        self.convs2 = nn.ModuleList([_conv(1), _conv(1), _conv(1)])
        self.convs2.apply(init_weights)

    def forward(self, x):
        """Apply the three residual pairs in sequence and return the result."""
        for dilated, plain in zip(self.convs1, self.convs2):
            branch = dilated(F.leaky_relu(x, LRELU_SLOPE))
            branch = plain(F.leaky_relu(branch, LRELU_SLOPE))
            x = branch + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution in the block."""
        for group in (self.convs1, self.convs2):
            for layer in group:
                remove_weight_norm(layer)
|
49 |
+
|
50 |
+
|
51 |
+
class ResBlock2(torch.nn.Module):
    """Residual block of two weight-normalised dilated convolutions."""

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.h = h

        def _conv(rate):
            # Channel-preserving, weight-normalised conv at the given dilation.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                                      padding=get_padding(kernel_size, rate)))

        self.convs = nn.ModuleList([_conv(dilation[0]), _conv(dilation[1])])
        self.convs.apply(init_weights)

    def forward(self, x):
        """Apply each (leaky ReLU -> conv) step with a residual connection."""
        for conv in self.convs:
            branch = conv(F.leaky_relu(x, LRELU_SLOPE))
            x = branch + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution in the block."""
        for layer in self.convs:
            remove_weight_norm(layer)
|
73 |
+
|
74 |
+
|
75 |
+
class Generator(torch.nn.Module):
    """HiFi-GAN generator: transposed-conv upsampling stages with parallel residual blocks.

    The input convolution is built for 80 input channels; the output is a
    single channel passed through tanh.
    """

    def __init__(self, h):
        super().__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        # Each upsampling stage halves the channel count.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            in_ch = h.upsample_initial_channel//(2**stage)
            out_ch = h.upsample_initial_channel//(2**(stage+1))
            upsample = ConvTranspose1d(in_ch, out_ch, kernel, rate, padding=(kernel-rate)//2)
            self.ups.append(weight_norm(upsample))

        # One residual block per (stage, kernel size) pair, stored stage-major.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = h.upsample_initial_channel//(2**(stage+1))
            for kernel, dilations in zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes):
                self.resblocks.append(resblock(h, ch, kernel, dilations))

        # ``ch`` is the channel count of the last upsampling stage.
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        """Map input features ``x`` to a tanh-bounded single-channel output."""
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = self.ups[i](F.leaky_relu(x, LRELU_SLOPE))
            # Average the outputs of this stage's parallel residual blocks.
            base = i*self.num_kernels
            xs = None
            for j in range(self.num_kernels):
                branch = self.resblocks[base+j](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        # Final activation uses leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from all layers."""
        print('Removing weight norm...')
        for upsample in self.ups:
            remove_weight_norm(upsample)
        for block in self.resblocks:
            block.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
|
126 |
+
|
127 |
+
|
128 |
+
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the signal into (time // period, period) and applies 2-D convs."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        # Kept as an equality test so non-bool arguments select the same
        # normalisation as before.
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # NOTE(review): padding is computed for a kernel of 5 (and fixed at 2
        # on the last conv) regardless of ``kernel_size``.
        layers = []
        for c_in, c_out in ((1, 32), (32, 128), (128, 512), (512, 1024)):
            layers.append(norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1),
                                        padding=(get_padding(5, 1), 0))))
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)`` for a 3-D input ``x``."""
        fmap = []

        # 1d to 2d: reflect-pad the last axis up to a multiple of the period,
        # then fold it.
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
162 |
+
|
163 |
+
|
164 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bank of ``DiscriminatorP`` modules with periods 2, 3, 5, 7 and 11."""

    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [DiscriminatorP(period) for period in (2, 3, 5, 7, 11)]
        )

    def forward(self, y, y_hat):
        """Run every sub-discriminator on real ``y`` and generated ``y_hat``.

        Returns:
            ``(real_scores, generated_scores, real_fmaps, generated_fmaps)``,
            each a list with one entry per sub-discriminator.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            real_score, real_fmap = disc(y)
            gen_score, gen_fmap = disc(y_hat)
            y_d_rs.append(real_score)
            fmap_rs.append(real_fmap)
            y_d_gs.append(gen_score)
            fmap_gs.append(gen_fmap)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
189 |
+
|
190 |
+
|
191 |
+
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: a stack of (grouped) 1-D convolutions over the signal."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        # Kept as an equality test so non-bool arguments select the same
        # normalisation as before.
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, kernel, stride, groups, padding)
        layer_specs = (
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
            for c_in, c_out, kernel, stride, groups, pad in layer_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)`` for input ``x``."""
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
217 |
+
|
218 |
+
|
219 |
+
class MultiScaleDiscriminator(torch.nn.Module):
    """Three ``DiscriminatorS`` modules; the 2nd and 3rd see average-pooled input.

    The first sub-discriminator uses spectral norm, the others weight norm.
    """

    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [DiscriminatorS(use_spectral_norm=True), DiscriminatorS(), DiscriminatorS()]
        )
        self.meanpools = nn.ModuleList(
            [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]
        )

    def forward(self, y, y_hat):
        """Run every sub-discriminator on real ``y`` and generated ``y_hat``.

        Pooling is cumulative: each later discriminator sees the previous
        discriminator's input pooled once more.

        Returns:
            ``(real_scores, generated_scores, real_fmaps, generated_fmaps)``,
            each a list with one entry per sub-discriminator.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for idx, disc in enumerate(self.discriminators):
            if idx != 0:
                pool = self.meanpools[idx-1]
                y = pool(y)
                y_hat = pool(y_hat)
            real_score, real_fmap = disc(y)
            gen_score, gen_fmap = disc(y_hat)
            y_d_rs.append(real_score)
            fmap_rs.append(real_fmap)
            y_d_gs.append(gen_score)
            fmap_gs.append(gen_fmap)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
249 |
+
|
250 |
+
|
251 |
+
def feature_loss(fmap_r, fmap_g):
    """Return twice the sum of mean absolute differences between paired feature maps.

    ``fmap_r`` and ``fmap_g`` are parallel lists (one per discriminator) of
    lists of tensors. Returns the int ``0`` when there is nothing to compare.
    """
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real_feat, gen_feat in zip(real_maps, gen_maps):
            total = total + torch.mean(torch.abs(real_feat - gen_feat))

    return total*2
|
258 |
+
|
259 |
+
|
260 |
+
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss.

    For every (real, generated) output pair, adds ``mean((1 - real)^2)`` and
    ``mean(generated^2)``.

    Returns:
        ``(loss, r_losses, g_losses)`` where the two lists hold the per-pair
        terms as Python floats.
    """
    loss = 0
    r_losses, g_losses = [], []
    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1-real_out)**2)
        gen_term = torch.mean(gen_out**2)
        loss = loss + (real_term + gen_term)
        r_losses.append(real_term.item())
        g_losses.append(gen_term.item())

    return loss, r_losses, g_losses
|
272 |
+
|
273 |
+
|
274 |
+
def generator_loss(disc_outputs):
    """Least-squares generator loss: sum of ``mean((1 - output)^2)`` over discriminator outputs.

    Returns:
        ``(loss, gen_losses)`` where ``gen_losses`` holds the per-output terms
        as tensors.
    """
    loss = 0
    gen_losses = []
    for output in disc_outputs:
        term = torch.mean((1-output)**2)
        gen_losses.append(term)
        loss = loss + term

    return loss, gen_losses
|
283 |
+
|
hifigan/requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch==1.4.0
|
2 |
+
numpy==1.17.4
|
3 |
+
librosa==0.7.2
|
4 |
+
scipy==1.4.1
|
5 |
+
tensorboard==2.0
|
6 |
+
soundfile==0.10.3.post1
|
7 |
+
matplotlib==3.1.3
|
hifigan/test_fs2_speed.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from espnet2.bin.tts_inference import Text2Speech
|
2 |
+
import time
|
3 |
+
|
4 |
+
# Time five synthesis calls of the FastSpeech2 model on each device.
for device in ("cpu", "cuda"):
    print(f"loading model in {device}")
    text2speech = Text2Speech(train_config="/speech/arun/tts/hifigan/config.yaml",model_file="/var/www/html/IITM_TTS/E2E_TTS_FS2/fastspeech2/models/Hindi_male/train.loss.ave.pth",device=device)
    for i in range(5):
        print("Run ",i)
        # perf_counter is monotonic and high-resolution, so the measured
        # interval is not disturbed by system clock adjustments.
        st = time.perf_counter()
        text2speech("EटA sटarakcars औr Elgoridam par pAठyakram par pahlE wyAखyAn mEq")
        et = time.perf_counter()
        elapsed = (et-st)
        print("Elapsed time:", elapsed)
    print("-----------------------------")
|
hifigan/test_hifigan_speed.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from models import Generator
|
2 |
+
from scipy.io.wavfile import write
|
3 |
+
from meldataset import MAX_WAV_VALUE
|
4 |
+
import numpy as np
|
5 |
+
import os
|
6 |
+
import json
|
7 |
+
from env import AttrDict
|
8 |
+
import torch
|
9 |
+
import time
|
10 |
+
|
11 |
+
# The vocoder config does not depend on the device, so read it once.
config_file = os.path.join('/speech/arun/tts/hifigan/cp_hifigan/config.json')
with open(config_file) as f:
    data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)

# Time the HiFi-GAN generator (including the WAV write) on each device for
# three input lengths: the stored features, and 5x / 10x repetitions of them.
for dev in ("cpu", "cuda"):
    if dev == "cuda" and not torch.cuda.is_available():
        # Skip instead of crashing on hosts without a usable GPU.
        print("CUDA is not available; skipping cuda")
        continue
    print(f"loading model in {dev}")
    device=torch.device(dev)
    y1 = torch.load("/speech/arun/tts/hifigan/denorm/test_243.npy.pt", map_location=device)
    # torch.cat is available on every torch release; torch.concat is a newer alias.
    y2 = torch.cat([y1]*5, dim=1)
    y3 = torch.cat([y1]*10, dim=1)

    torch.manual_seed(h.seed)
    generator = Generator(h).to(device)
    state_dict_g = torch.load("/speech/arun/tts/hifigan/cp_hifigan/g_00120000", map_location=device)
    generator.load_state_dict(state_dict_g['generator'])
    generator.eval()
    generator.remove_weight_norm()
    for i in range(3):
        print("Run ",i)
        for x in [y1, y2, y3]:
            with torch.no_grad():
                # perf_counter is monotonic and high-resolution, which is what
                # interval timing needs.
                st = time.perf_counter()
                y_g_hat = generator(x)
                audio = y_g_hat.squeeze()
                audio = audio * MAX_WAV_VALUE
                # .cpu() also waits for any pending GPU work to finish.
                audio = audio.cpu().numpy().astype('int16')
                output_file = "gen.wav"
                write(output_file, h.sampling_rate, audio)
                et = time.perf_counter()
                elapsed = (et-st)
                print("Elapsed time:", elapsed)
|
hifigan/test_tts_speed.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from models import Generator
|
2 |
+
from scipy.io.wavfile import write
|
3 |
+
from meldataset import MAX_WAV_VALUE
|
4 |
+
import numpy as np
|
5 |
+
import os
|
6 |
+
import json
|
7 |
+
from env import AttrDict
|
8 |
+
import torch
|
9 |
+
import time
|
10 |
+
from espnet2.bin.tts_inference import Text2Speech
|
11 |
+
|
12 |
+
# The vocoder config does not depend on the device, so read it once.
config_file = os.path.join('/speech/arun/tts/hifigan/cp_hifigan/config.json')
with open(config_file) as f:
    data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)

# Time the full text -> features -> waveform -> WAV file pipeline on each device.
for dev in ("cpu", "cuda"):
    if dev == "cuda" and not torch.cuda.is_available():
        # Skip instead of crashing on hosts without a usable GPU.
        print("CUDA is not available; skipping cuda")
        continue
    print(f"loading model in {dev}")
    device=torch.device(dev)

    torch.manual_seed(h.seed)
    generator = Generator(h).to(device)
    state_dict_g = torch.load("/speech/arun/tts/hifigan/cp_hifigan/g_00120000", map_location=device)
    generator.load_state_dict(state_dict_g['generator'])
    generator.eval()
    generator.remove_weight_norm()
    text2speech = Text2Speech(train_config="/speech/arun/tts/hifigan/config.yaml",model_file="/var/www/html/IITM_TTS/E2E_TTS_FS2/fastspeech2/models/Hindi_male/train.loss.ave.pth",device=dev)
    for i in range(3):
        print("Run ",i)
        with torch.no_grad():
            # perf_counter is monotonic and high-resolution, which is what
            # interval timing needs.
            st = time.perf_counter()
            out = text2speech("EटA sटarakcars औr Elgoridam par pAठyakram par pahlE wyAखyAn mEq")
            # Transpose the generated features and add a leading batch axis
            # before handing them to the vocoder.
            x = out["feat_gen_denorm"].T.unsqueeze(0).to(device)
            y_g_hat = generator(x)
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            # .cpu() also waits for any pending GPU work to finish.
            audio = audio.cpu().numpy().astype('int16')
            output_file = "gen.wav"
            write(output_file, h.sampling_rate, audio)
            et = time.perf_counter()
            elapsed = (et-st)
            print("Elapsed time:", elapsed)
|