AbeShinzo0708 committed on
Commit
84346f8
·
1 Parent(s): 4bae4d2

Upload 11 files

Browse files
100epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bae8667fd213c3b5a093e8122fb93e0a2d7dc3e8a502543c97940354bdaea10
3
+ size 373275392
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: AI Kishida Fumio Speaker
3
- emoji: 🐢
4
- colorFrom: red
5
- colorTo: pink
6
  sdk: streamlit
7
- sdk_version: 1.27.2
8
  app_file: app.py
9
  pinned: false
10
  license: openrail
 
1
  ---
2
+ title: AI岸田文雄メーカー
3
+ emoji: 🔥
4
+ colorFrom: indigo
5
+ colorTo: blue
6
  sdk: streamlit
7
+ sdk_version: 1.27.0
8
  app_file: app.py
9
  pinned: false
10
  license: openrail
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit front end for text-to-speech with the ESPnet checkpoint ./100epoch.pth.

The page shows a title and a picture, takes free text from the user and, when
the button is pressed, synthesises a waveform on the CPU and plays it back.
"""
import time

import streamlit as st
import numpy as np
import torch
from espnet2.bin.tts_inference import Text2Speech
from scipy.io.wavfile import write
from PIL import Image


fs, lang = 44100, "Japanese"
model = "./100epoch.pth"
x = "これはテストメッセージです"


@st.cache_resource
def load_text2speech(model_file):
    """Build the Text2Speech inference object once per server process.

    Streamlit re-runs this whole script on every widget interaction; without
    caching, the checkpoint would be re-read from disk on each rerun.

    Args:
        model_file: Path to the model checkpoint passed to ESPnet.

    Returns:
        The ``Text2Speech`` instance configured for CPU inference.
    """
    return Text2Speech.from_pretrained(
        model_file=model_file,
        device="cpu",
        speed_control_alpha=1.0,
        noise_scale=0.333,
        noise_scale_dur=0.333,
    )


text2speech = load_text2speech(model)

# Trailing silence appended to the synthesised audio
# (30000 samples at fs = 44100 Hz is roughly 0.68 s).
pause = np.zeros(30000, dtype=np.float32)

st.title("おしゃべりAI岸田文雄メーカー")
image = Image.open('kishida.jpg')
st.image(image)
text = st.text_area(label='ここにテキストを入力 (Input Text)↓', height=100, max_chars=2048)


if st.button("生成(Generate)"):
    if not text.strip():
        # Nothing to synthesise: tell the user instead of calling the model
        # with an empty string.
        st.warning("テキストを入力してください (Please enter some text)")
    else:
        # Inference only, so gradient tracking is disabled.
        with torch.no_grad():
            wav = text2speech(text)["wav"]

        # Flatten the tensor to a 1-D NumPy array and append the silence.
        final_wav = np.concatenate([wav.view(-1).cpu().numpy(), pause])
        st.audio(final_wav, sample_rate=fs)
config.yaml ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/finetune_full_band_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/tts_full_band_vits
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 4
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 100
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - train
40
+ - total_count
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: -1
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: 50
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ pretrain_path: null
64
+ init_param:
65
+ - downloads/full_band_vits_accent_with_pause_pretrain/exp/tts_train_full_band_vits_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train.total_count.ave_10best.pth:tts:tts
66
+ ignore_init_mismatch: false
67
+ freeze_param: []
68
+ num_iters_per_epoch: 1000
69
+ batch_size: 20
70
+ valid_batch_size: null
71
+ batch_bins: 100000
72
+ valid_batch_bins: null
73
+ train_shape_file:
74
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/text_shape.phn
75
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/speech_shape
76
+ valid_shape_file:
77
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/text_shape.phn
78
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/speech_shape
79
+ batch_type: numel
80
+ valid_batch_type: null
81
+ fold_length:
82
+ - 150
83
+ - 409600
84
+ sort_in_batch: descending
85
+ shuffle_within_batch: false
86
+ sort_batch: descending
87
+ multiple_iterator: false
88
+ chunk_length: 500
89
+ chunk_shift_ratio: 0.5
90
+ num_cache_chunks: 1024
91
+ chunk_excluded_key_prefixes: []
92
+ train_data_path_and_name_and_type:
93
+ - - dump/44k/raw/tr_no_dev/text
94
+ - text
95
+ - text
96
+ - - dump/44k/raw/tr_no_dev/wav.scp
97
+ - speech
98
+ - sound
99
+ valid_data_path_and_name_and_type:
100
+ - - dump/44k/raw/dev/text
101
+ - text
102
+ - text
103
+ - - dump/44k/raw/dev/wav.scp
104
+ - speech
105
+ - sound
106
+ allow_variable_data_keys: false
107
+ max_cache_size: 0.0
108
+ max_cache_fd: 32
109
+ valid_max_cache_size: null
110
+ exclude_weight_decay: false
111
+ exclude_weight_decay_conf: {}
112
+ optim: adamw
113
+ optim_conf:
114
+ lr: 0.0001
115
+ betas:
116
+ - 0.8
117
+ - 0.99
118
+ eps: 1.0e-09
119
+ weight_decay: 0.0
120
+ scheduler: exponentiallr
121
+ scheduler_conf:
122
+ gamma: 0.999875
123
+ optim2: adamw
124
+ optim2_conf:
125
+ lr: 0.0001
126
+ betas:
127
+ - 0.8
128
+ - 0.99
129
+ eps: 1.0e-09
130
+ weight_decay: 0.0
131
+ scheduler2: exponentiallr
132
+ scheduler2_conf:
133
+ gamma: 0.999875
134
+ generator_first: false
135
+ token_list:
136
+ - <blank>
137
+ - <unk>
138
+ - '1'
139
+ - '2'
140
+ - '0'
141
+ - '3'
142
+ - '4'
143
+ - '-1'
144
+ - '5'
145
+ - a
146
+ - o
147
+ - '-2'
148
+ - i
149
+ - '-3'
150
+ - u
151
+ - e
152
+ - k
153
+ - n
154
+ - t
155
+ - '6'
156
+ - r
157
+ - '-4'
158
+ - s
159
+ - N
160
+ - m
161
+ - pau
162
+ - '7'
163
+ - sh
164
+ - d
165
+ - g
166
+ - w
167
+ - '8'
168
+ - U
169
+ - '-5'
170
+ - I
171
+ - cl
172
+ - h
173
+ - y
174
+ - b
175
+ - '9'
176
+ - j
177
+ - ts
178
+ - ch
179
+ - '-6'
180
+ - z
181
+ - p
182
+ - '-7'
183
+ - f
184
+ - ky
185
+ - ry
186
+ - '-8'
187
+ - gy
188
+ - '-9'
189
+ - hy
190
+ - ny
191
+ - '-10'
192
+ - by
193
+ - my
194
+ - '-11'
195
+ - '-12'
196
+ - '-13'
197
+ - py
198
+ - '-14'
199
+ - '-15'
200
+ - v
201
+ - '10'
202
+ - '-16'
203
+ - '-17'
204
+ - '11'
205
+ - '-21'
206
+ - '-20'
207
+ - '12'
208
+ - '-19'
209
+ - '13'
210
+ - '-18'
211
+ - '14'
212
+ - dy
213
+ - '15'
214
+ - ty
215
+ - '-22'
216
+ - '16'
217
+ - '18'
218
+ - '19'
219
+ - '17'
220
+ - <sos/eos>
221
+ odim: null
222
+ model_conf: {}
223
+ use_preprocessor: true
224
+ token_type: phn
225
+ bpemodel: null
226
+ non_linguistic_symbols: null
227
+ cleaner: jaconv
228
+ g2p: pyopenjtalk_accent_with_pause
229
+ feats_extract: linear_spectrogram
230
+ feats_extract_conf:
231
+ n_fft: 2048
232
+ hop_length: 512
233
+ win_length: null
234
+ normalize: null
235
+ normalize_conf: {}
236
+ tts: vits
237
+ tts_conf:
238
+ generator_type: vits_generator
239
+ generator_params:
240
+ hidden_channels: 192
241
+ spks: -1
242
+ global_channels: -1
243
+ segment_size: 32
244
+ text_encoder_attention_heads: 2
245
+ text_encoder_ffn_expand: 4
246
+ text_encoder_blocks: 6
247
+ text_encoder_positionwise_layer_type: conv1d
248
+ text_encoder_positionwise_conv_kernel_size: 3
249
+ text_encoder_positional_encoding_layer_type: rel_pos
250
+ text_encoder_self_attention_layer_type: rel_selfattn
251
+ text_encoder_activation_type: swish
252
+ text_encoder_normalize_before: true
253
+ text_encoder_dropout_rate: 0.1
254
+ text_encoder_positional_dropout_rate: 0.0
255
+ text_encoder_attention_dropout_rate: 0.1
256
+ use_macaron_style_in_text_encoder: true
257
+ use_conformer_conv_in_text_encoder: false
258
+ text_encoder_conformer_kernel_size: -1
259
+ decoder_kernel_size: 7
260
+ decoder_channels: 512
261
+ decoder_upsample_scales:
262
+ - 8
263
+ - 8
264
+ - 2
265
+ - 2
266
+ - 2
267
+ decoder_upsample_kernel_sizes:
268
+ - 16
269
+ - 16
270
+ - 4
271
+ - 4
272
+ - 4
273
+ decoder_resblock_kernel_sizes:
274
+ - 3
275
+ - 7
276
+ - 11
277
+ decoder_resblock_dilations:
278
+ - - 1
279
+ - 3
280
+ - 5
281
+ - - 1
282
+ - 3
283
+ - 5
284
+ - - 1
285
+ - 3
286
+ - 5
287
+ use_weight_norm_in_decoder: true
288
+ posterior_encoder_kernel_size: 5
289
+ posterior_encoder_layers: 16
290
+ posterior_encoder_stacks: 1
291
+ posterior_encoder_base_dilation: 1
292
+ posterior_encoder_dropout_rate: 0.0
293
+ use_weight_norm_in_posterior_encoder: true
294
+ flow_flows: 4
295
+ flow_kernel_size: 5
296
+ flow_base_dilation: 1
297
+ flow_layers: 4
298
+ flow_dropout_rate: 0.0
299
+ use_weight_norm_in_flow: true
300
+ use_only_mean_in_flow: true
301
+ stochastic_duration_predictor_kernel_size: 3
302
+ stochastic_duration_predictor_dropout_rate: 0.5
303
+ stochastic_duration_predictor_flows: 4
304
+ stochastic_duration_predictor_dds_conv_layers: 3
305
+ vocabs: 85
306
+ aux_channels: 1025
307
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
308
+ discriminator_params:
309
+ scales: 1
310
+ scale_downsample_pooling: AvgPool1d
311
+ scale_downsample_pooling_params:
312
+ kernel_size: 4
313
+ stride: 2
314
+ padding: 2
315
+ scale_discriminator_params:
316
+ in_channels: 1
317
+ out_channels: 1
318
+ kernel_sizes:
319
+ - 15
320
+ - 41
321
+ - 5
322
+ - 3
323
+ channels: 128
324
+ max_downsample_channels: 1024
325
+ max_groups: 16
326
+ bias: true
327
+ downsample_scales:
328
+ - 2
329
+ - 2
330
+ - 4
331
+ - 4
332
+ - 1
333
+ nonlinear_activation: LeakyReLU
334
+ nonlinear_activation_params:
335
+ negative_slope: 0.1
336
+ use_weight_norm: true
337
+ use_spectral_norm: false
338
+ follow_official_norm: false
339
+ periods:
340
+ - 2
341
+ - 3
342
+ - 5
343
+ - 7
344
+ - 11
345
+ period_discriminator_params:
346
+ in_channels: 1
347
+ out_channels: 1
348
+ kernel_sizes:
349
+ - 5
350
+ - 3
351
+ channels: 32
352
+ downsample_scales:
353
+ - 3
354
+ - 3
355
+ - 3
356
+ - 3
357
+ - 1
358
+ max_downsample_channels: 1024
359
+ bias: true
360
+ nonlinear_activation: LeakyReLU
361
+ nonlinear_activation_params:
362
+ negative_slope: 0.1
363
+ use_weight_norm: true
364
+ use_spectral_norm: false
365
+ generator_adv_loss_params:
366
+ average_by_discriminators: false
367
+ loss_type: mse
368
+ discriminator_adv_loss_params:
369
+ average_by_discriminators: false
370
+ loss_type: mse
371
+ feat_match_loss_params:
372
+ average_by_discriminators: false
373
+ average_by_layers: false
374
+ include_final_outputs: true
375
+ mel_loss_params:
376
+ fs: 44100
377
+ n_fft: 2048
378
+ hop_length: 512
379
+ win_length: null
380
+ window: hann
381
+ n_mels: 80
382
+ fmin: 0
383
+ fmax: null
384
+ log_base: null
385
+ lambda_adv: 1.0
386
+ lambda_mel: 45.0
387
+ lambda_feat_match: 2.0
388
+ lambda_dur: 1.0
389
+ lambda_kl: 1.0
390
+ sampling_rate: 44100
391
+ cache_generator_outputs: true
392
+ pitch_extract: null
393
+ pitch_extract_conf: {}
394
+ pitch_normalize: null
395
+ pitch_normalize_conf: {}
396
+ energy_extract: null
397
+ energy_extract_conf: {}
398
+ energy_normalize: null
399
+ energy_normalize_conf: {}
400
+ required:
401
+ - output_dir
402
+ - token_list
403
+ version: '202308'
404
+ distributed: false
hooks/hook-espnet.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from PyInstaller.utils.hooks import copy_metadata
2
+
3
+ datas = copy_metadata('espnet')
hooks/hook-jamo.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from PyInstaller.utils.hooks import copy_metadata
2
+
3
+ datas = copy_metadata('jamo')
hooks/hook-librosa.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from PyInstaller.utils.hooks import copy_metadata
2
+
3
+ datas = copy_metadata('librosa')
hooks/hook-streamlit.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from PyInstaller.utils.hooks import copy_metadata
2
+
3
+ datas = copy_metadata('streamlit')
kishida.jpg ADDED
pre-fix/librosa/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
# Replacement package initialiser for librosa; background:
# https://github.com/librosa/librosa/issues/1682

import lazy_loader as lazy
from .version import version as __version__

# Path handed to lazy_loader: when this module is loaded from a compiled
# ".pyc" file, drop the trailing "c" so the path ends in ".py" instead.
_filename = __file__[:-1] if __file__.endswith('.pyc') else __file__

# Install the lazy attribute hooks returned by lazy_loader for this package.
__getattr__, __dir__, __all__ = lazy.attach_stub(__name__, _filename)
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+
3
+ # Install CPU version of pytorch
4
+ --extra-index-url https://download.pytorch.org/whl/cpu
5
+ torch
6
+
7
+ soundfile
8
+ espnet
9
+ espnet_model_zoo
10
+
11
+ # pyopenjtalk version must be 0.2
12
+ pyopenjtalk-prebuilt==0.2.0
13
+
14
+ # typeguard must be pinned to 2.13.3 (the latest version that supports Python 3.8)
15
+ typeguard==2.13.3
16
+
17
+ # Use version < 3.7.0 as a workaround; otherwise PyInstaller fails to install some DLLs
18
+ # https://github.com/pyinstaller/pyinstaller/pull/7505
19
+ # To visualize audio data
20
+ matplotlib<3.7.0