KevinGeng committed on
Commit
fabced5
1 Parent(s): 40157ab
Files changed (1)
app.py +266 -0
app.py ADDED
@@ -0,0 +1,266 @@
"""
TODO:
+ [x] Load Configuration
+ [ ] Checking
+ [ ] Better saving directory
"""
import numpy as np
from pathlib import Path
import jiwer
import torch
import torchaudio
from transformers import pipeline
from time import process_time, time

# local import
import sys

sys.path.append("src")

from espnet2.bin.tts_inference import Text2Speech

# Use the GPU when available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

import gradio as gr

# ASR part
# Evaluation audio (absolute path on the author's machine).
audio_files = [
    str(x)
    for x in sorted(
        Path(
            "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
        ).glob("**/*wav")
    )
]
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]

# Fine-tuned ASR model used to transcribe the input speech.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
)
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
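
# jiwer is imported above but never used; the TODO item "[ ] Checking"
# suggests a WER check was planned. A minimal sketch, assuming a
# `references` dict mapping file stems to ground-truth transcripts
# (hypothetical; no such dict ships with this commit):
def check_wer(audio_paths, references):
    """Score ASR output against reference transcripts with jiwer."""
    refs, hyps = [], []
    for path in audio_paths:
        ref = references.get(Path(path).stem)
        if ref is None:
            continue  # skip files without a reference transcript
        refs.append(ref)
        hyps.append(transcriber(path)["text"])
    return jiwer.wer(refs, hyps)
# Example: check_wer(audio_files, {"001": "some reference transcript"})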
# [Female] kan-bayashi ljspeech parallel wavegan
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
# [Male] fastspeech2-en-200_speaker-cv4, hifigan vocoder

# @title English multi-speaker pretrained model { run: "auto" }
lang = "English"
tag = "kan-bayashi/libritts_xvector_vits"
# VITS is end-to-end, so it needs no external vocoder; one is still set here.
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long"  # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
from espnet2.utils.types import str_or_none
text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device=device,  # was hard-coded to "cuda"; use the detected device
    # The options below apply to attention-based models (e.g. Tacotron 2)
    # and are ignored by VITS.
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    speed_control_alpha=1.0,
)
import glob
import os
import kaldiio

# Get model directory path
from espnet_model_zoo.downloader import ModelDownloader

d = ModelDownloader()
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
81
+ # Speaker x-vector selection
82
+
83
+ xvector_ark = [
84
+ p
85
+ for p in glob.glob(
86
+ f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True
87
+ )
88
+ if "tr" in p
89
+ ][0]
90
+ xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
91
+ spks = list(xvectors.keys())
92
+
93
+ male_spks = {
94
+ "M1": "2300_131720",
95
+ "M2": "1320_122612",
96
+ }
97
+ # "M3": "1188_133604",
98
+ # "M4": "61_70970",
99
+ female_spks = {"F1": "2961_961", "F2": "8463_287645", }
100
+ # "F3": "121_121726"
101
+ spks = dict(male_spks, **female_spks)
102
+ spk_names = sorted(spks.keys())
103
+
104
+
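
# A minimal startup sanity check (an assumption, not part of the original
# app): confirm every UI speaker has an x-vector in the loaded ark, and
# that synthesis with a stored embedding runs end to end.
missing = [sid for sid in spks.values() if sid not in xvectors]
if missing:
    raise KeyError(f"Speaker IDs without x-vectors: {missing}")

def smoke_test(spk_name="M1", text="This is a synthesis smoke test."):
    with torch.no_grad():
        wav = text2speech(text, spembs=xvectors[spks[spk_name]])["wav"]
    return wav  # 1-D waveform at text2speech.fs (22.05 kHz for this model)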
# 20230224 Mousa: no reference text required; fall back to the ASR output.
def ASRTTS(audio_file, spk_name, ref_text=""):
    """Transcribe audio_file, then resynthesize it in the target voice."""
    spk = spks[spk_name]
    spembs = xvectors[spk]
    if ref_text == "":
        reg_text = transcriber(audio_file)["text"]
    else:
        reg_text = ref_text

    speech, sr = torchaudio.load(
        audio_file, channels_first=True
    )  # Mono channel
    wav_tensor_spembs = text2speech(
        text=reg_text, speech=speech, spembs=spembs
    )["wav"]
    save_id = (
        "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
    )
    Path("./wav").mkdir(exist_ok=True)  # TODO: better saving directory
    torchaudio.save(
        save_id,
        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
        sample_rate=22050,
    )
    return save_id, reg_text
def ASRTTS_clean(audio_file, spk_name):
    """Same as ASRTTS without a reference-text override; returns one path."""
    spk = spks[spk_name]
    spembs = xvectors[spk]

    reg_text = transcriber(audio_file)["text"]

    speech, sr = torchaudio.load(
        audio_file, channels_first=True
    )  # Mono channel
    wav_tensor_spembs = text2speech(
        text=reg_text, speech=speech, spembs=spembs
    )["wav"]
    save_id = (
        "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
    )
    Path("./wav").mkdir(exist_ok=True)  # TODO: better saving directory
    torchaudio.save(
        save_id,
        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
        sample_rate=22050,
    )
    return save_id
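
# A batch-conversion sketch (assumption: every path in `paths` is a readable
# wav file), useful for converting the evaluation set outside the Gradio UI:
def convert_all(paths, spk_name="M1"):
    """Run ASRTTS_clean over a list of files; returns the saved paths."""
    return [ASRTTS_clean(p, spk_name) for p in paths]
# Example: converted = convert_all(audio_files, spk_name="F1")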
# def ref_reg_callback(audio_file, spk_name, ref_text):
#     reg_text = ref_text
#     return audio_file, spk_name, reg_text

reference_textbox = gr.Textbox(
    value="",
    placeholder="Input reference here",
    label="Reference",
)

recognition_textbox = gr.Textbox(
    value="",
    placeholder="Output recognition here",
    label="Recognition",
)
# Standalone widgets kept from an earlier prototype; the live UI is built
# inside the gr.Blocks context below.
speaker_option = gr.Radio(choices=spk_names, label="Speaker")
# speaker_profiles = {
#     "Male_1": "speaker_icons/male1.png",
#     "Male_2": "speaker_icons/male2.png",
#     "Female_1": "speaker_icons/female1.png",
#     "Female_2": "speaker_icons/female2.png",
# }

# speaker_option = gr.Image(label="Choose your speaker profile",
#                           image_mode="RGB",
#                           options=speaker_profiles
#                           )

input_audio = gr.Audio(
    source="upload", type="filepath", label="Audio_to_Evaluate"
)
output_audio = gr.Audio(
    source="upload", type="filepath", label="Synthesized Audio"
)
examples = [
    ["./samples/001.wav", "M1", ""],
    ["./samples/002.wav", "M2", ""],
    ["./samples/003.wav", "F1", ""],
    ["./samples/004.wav", "F2", ""],
]
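
# `examples` is assembled above but never attached to the demo. Under
# Gradio 3.x it could be registered inside the Blocks context below,
# roughly as follows (a sketch; the ./samples paths are assumed to exist):
#     gr.Examples(
#         examples=examples,
#         inputs=[input_audio, speaker_option, reference_textbox],
#     )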
def change_audiobox(choice):
    """Toggle the input widget between upload and microphone sources."""
    if choice == "upload":
        input_audio = gr.Audio.update(source="upload", visible=True)
    elif choice == "microphone":
        input_audio = gr.Audio.update(source="microphone", visible=True)
    else:
        input_audio = gr.Audio.update(visible=False)
    return input_audio
def show_icon(choice):
    """Swap the avatar image to match the selected speaker."""
    icons = {
        "M1": "speaker_icons/male1.png",
        "M2": "speaker_icons/male2.png",
        "F1": "speaker_icons/female1.png",
        "F2": "speaker_icons/female2.png",
    }
    if choice in icons:
        return gr.Image.update(value=icons[choice], visible=True)
    # The original if/elif chain had no fallback and would raise
    # UnboundLocalError for an unknown speaker; hide the icon instead.
    return gr.Image.update(visible=False)
with gr.Blocks(
    analytics_enabled=False,
    css=".gradio-container {background-color: #78BD91}",
) as demo:
    with gr.Column():
        input_format = gr.Radio(
            choices=["upload", "microphone"], label="Choose your input format"
        )
        input_audio = gr.Audio(
            source="upload",
            type="filepath",
            label="Input Audio",
            interactive=True,
            visible=False,
        )
        input_format.change(
            fn=change_audiobox, inputs=input_format, outputs=input_audio
        )

        speaker_option = gr.Radio(
            choices=spk_names, value="M1", label="Choose your target speaker"
        )
        spk_icon = gr.Image(
            value="speaker_icons/male1.png",
            type="filepath",
            image_mode="RGB",
            source="upload",
            shape=[50, 50],
            interactive=True,
            visible=True,
        )
        speaker_option.change(
            fn=show_icon, inputs=speaker_option, outputs=spk_icon
        )

        b2 = gr.Button("Convert")

        output_audio = gr.Audio(
            source="upload", type="filepath", label="Synthesized Audio"
        )
        b2.click(
            ASRTTS_clean,
            inputs=[input_audio, speaker_option],
            outputs=output_audio,
        )

# share=True also exposes a temporary public *.gradio.live URL.
demo.launch(share=True)