KevinGeng committed on
Commit f5460b4
1 Parent(s): bb034a8

Update ASR engine to Whisper-based model

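This commit swaps the recognizer in app.py from the wav2vec2-CTC checkpoint to a fine-tuned Whisper checkpoint. As a rough sketch of the before/after wiring (model IDs are taken from the diff below; the sample path reuses ./samples/001.wav from the repo's example list):

from transformers import pipeline

# Old engine (now commented out in app.py): wav2vec2 + CTC fine-tuned on PAL data
# transcriber = pipeline("automatic-speech-recognition",
#                        model="KevinGeng/PAL_John_128_train_dev_test_seed_1")

# New engine: Whisper-medium fine-tuned on PAL300 (25 steps)
transcriber = pipeline("automatic-speech-recognition",
                       model="KevinGeng/whipser_medium_en_PAL300_step25")

print(transcriber("./samples/001.wav")["text"])  # prints the recognized text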
.gitignore CHANGED
@@ -1,7 +1,7 @@
 flagged
-wav
 samples
-wav
+wav/*.wav
+wav/**/*.wav
 wav.bak
 
 model
app.py CHANGED
@@ -15,8 +15,7 @@ from pathlib import Path
 # local import
 import sys
 from espnet2.bin.tts_inference import Text2Speech
-
-# pdb.set_trace()
+from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC# pdb.set_trace()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 sys.path.append("src")
@@ -34,10 +33,22 @@ audio_files = [
     )
 ]
 # audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
-transcriber = pipeline(
-    "automatic-speech-recognition",
-    model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
-)
+# transcriber = pipeline(
+#     "automatic-speech-recognition",
+#     model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
+# )
+
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+
+processor = AutoProcessor.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
+
+model = AutoModelForSpeechSeq2Seq.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
+
+# feature_extractor = AutoFeatureExtractor.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
+# representation_model = AutoModelForCTC.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
+# tokenizer = AutoTokenizer.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
+
+transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
 # transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
 # 【Female】kan-bayashi ljspeech parallel wavegan
 # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
@@ -211,9 +222,6 @@ def download_file(audio_file):
     return gr.File(value=audio_file)
     # pdb.set_trace()
 
-# if __name__ == "__main__":
-#     file_share_app.run(port=3000)
-
 with gr.Blocks(
     analytics_enabled=False,
     css=".gradio-container {background-color: #78BD91}",
@@ -249,7 +257,7 @@ with gr.Blocks(
         b2 = gr.Button("Convert")
 
         output_audio = gr.Audio(
-            source="upload", label="Converted Audio", interactive=False
+            source="upload", file="filepath", label="Converted Audio", interactive=False
         )
 
         b2.click(
@@ -258,5 +266,7 @@ with gr.Blocks(
             outputs=output_audio,
             api_name="convert"
         )
+
+# download_file("wav/001_F1_spkembs.wav")
 
 demo.launch(share=False)
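One note on the new output_audio widget: it passes file="filepath", which is not a gr.Audio keyword in the Gradio releases I'm aware of (the input widgets elsewhere in this repo use type="filepath"); depending on the version the extra keyword is ignored or rejected. A hedged sketch of what is presumably intended:

import gradio as gr

# Assumed fix: `type="filepath"` asks Gradio to hand the component a file path
output_audio = gr.Audio(
    source="upload", type="filepath", label="Converted Audio", interactive=False
)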
app.ver1.py ADDED
@@ -0,0 +1,72 @@
1
+ #TODO:
2
+ # + [x] Load Configuration
3
+ # + [ ] Checking
4
+ # + [ ] Better saving directory
5
+
6
+ from pathlib import Path
7
+ from transformers import pipeline
8
+ import torch.nn as nn
9
+ import torch
10
+ import torchaudio
11
+ import gradio as gr
12
+ import sys
13
+
14
+ # Local imports
15
+ sys.path.append("src")
16
+ from espnet2.bin.tts_inference import Text2Speech
17
+ from espnet2.utils.types import str_or_none
18
+
19
+ # Check if GPU is available
20
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
21
+
22
+ # ASR part
23
+
24
+ data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
25
+ audio_files = sorted(list(Path(data_path).glob("**/*wav")))
26
+ # audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
27
+
28
+ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
29
+
30
+ # TTS part
31
+ def load_model(lang, tag, vocoder_tag):
32
+ if lang == "Japanese":
33
+ if tag == "kan-bayashi/ljspeech_parallel_wavegan":
34
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_parallel_wavegan")
35
+ elif tag == "kan-bayashi/ljspeech_merlin_multi_band_melgan":
36
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_merlin_multi_band_melgan")
37
+ else:
38
+ raise ValueError(f"Not supported: lang={lang}, tag={tag}")
39
+ vocoder = None if vocoder_tag == "none" else vocoder_tag
40
+ elif lang == "English":
41
+ # VITS needs no vocoder; others do
42
+ if tag == "kan-bayashi/libritts_xvector_vits":
43
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/libritts_xvector_vits")
44
+ vocoder = None
45
+ elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
46
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3")
47
+ vocoder = "melgan"
48
+ else:
49
+ raise ValueError(f"Not supported: lang={lang}, tag={tag}")
50
+ else:
51
+ raise ValueError(f"Not supported: lang={lang}")
52
+ return tts_model, vocoder
53
+
54
+ tts_model, vocoder_tag = load_model(lang="English", tag="kan-bayashi/libritts_xvector_vits", vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long")
55
+ tts_model = tts_model.to(device)
56
+
57
+ vocoder = None if vocoder_tag == "none" else torchaudio.models.vocoder.from_pretrained(vocoder_tag).to(device)
58
+
59
+ # Gradio part
60
+ def synthesize(text):
61
+ with torch.no_grad():
62
+ # Text-to-speech
63
+ wav = tts_model(text)[0]
64
+ if vocoder is not None:
65
+ # Apply vocoder
66
+ wav = vocoder.inference(wav)
67
+ # Convert to numpy array
68
+ wav = wav.squeeze().cpu().numpy()
69
+ return wav
70
+
71
+ interface = gr.Interface(synthesize, inputs="text", outputs="audio")
72
+ interface.launch()
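A caveat on this new script: torchaudio has no torchaudio.models.vocoder.from_pretrained, so line 57 will fail as written. The other scripts in this commit let ESPnet resolve the vocoder from its tag instead, which is likely the intent here; a minimal sketch using the same tags that appear in this file:

import torch
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# ESPnet downloads and wires the vocoder itself when given vocoder_tag,
# so no separate torchaudio call is needed (VITS can also run with vocoder_tag=None).
text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none("kan-bayashi/libritts_xvector_vits"),
    vocoder_tag=str_or_none("parallel_wavegan/vctk_parallel_wavegan.v1.long"),
    device="cuda" if torch.cuda.is_available() else "cpu",
)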
app.whisper.fine_tuned.py ADDED
@@ -0,0 +1,272 @@
1
+ """
2
+ TODO:
3
+ + [x] Load Configuration
4
+ + [ ] Checking
5
+ + [ ] Better saving directory
6
+ """
7
+ import numpy as np
8
+ from pathlib import Path
9
+ import torch.nn as nn
10
+ import torch
11
+ import torchaudio
12
+ from transformers import pipeline
13
+ from pathlib import Path
14
+
15
+ # local import
16
+ import sys
17
+ from espnet2.bin.tts_inference import Text2Speech
18
+ from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC# pdb.set_trace()
19
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
20
+
21
+ sys.path.append("src")
22
+
23
+ import gradio as gr
24
+
25
+ # ASR part
26
+
27
+ audio_files = [
28
+ str(x)
29
+ for x in sorted(
30
+ Path(
31
+ "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
32
+ ).glob("**/*wav")
33
+ )
34
+ ]
35
+ # audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
36
+ # transcriber = pipeline(
37
+ # "automatic-speech-recognition",
38
+ # model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
39
+ # )
40
+
41
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
42
+
43
+ processor = AutoProcessor.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
44
+
45
+ model = AutoModelForSpeechSeq2Seq.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
46
+
47
+ # feature_extractor = AutoFeatureExtractor.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
48
+ # representation_model = AutoModelForCTC.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
49
+ # tokenizer = AutoTokenizer.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
50
+
51
+ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
52
+ # transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
53
+ # 【Female】kan-bayashi ljspeech parallel wavegan
54
+ # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
55
+ # 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
56
+ # pdb.set_trace()
57
+
58
+ # @title English multi-speaker pretrained model { run: "auto" }
59
+ lang = "English"
60
+ tag = "kan-bayashi/libritts_xvector_vits"
61
+ # vits needs no
62
+ vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
63
+ from espnet2.bin.tts_inference import Text2Speech
64
+ from espnet2.utils.types import str_or_none
65
+
66
+ text2speech = Text2Speech.from_pretrained(
67
+ model_tag=str_or_none(tag),
68
+ vocoder_tag=str_or_none(vocoder_tag),
69
+ device="cuda",
70
+ use_att_constraint=False,
71
+ backward_window=1,
72
+ forward_window=3,
73
+ speed_control_alpha=1.0,
74
+ )
75
+
76
+ import glob
77
+ import os
78
+ import numpy as np
79
+ import kaldiio
80
+
81
+ # Get model directory path
82
+ from espnet_model_zoo.downloader import ModelDownloader
83
+
84
+ d = ModelDownloader()
85
+ model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
86
+
87
+ # Speaker x-vector selection
88
+
89
+ xvector_ark = [
90
+ p
91
+ for p in glob.glob(
92
+ f"xvector/test-clean/spk_xvector.ark", recursive=True
93
+ )
94
+ if "test" in p
95
+ ][0]
96
+ xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
97
+ spks = list(xvectors.keys())
98
+
99
+ male_spks = {
100
+ "Male1": "2300_131720",
101
+ "Male2": "1320_122612",
102
+ }
103
+ # "M3": "1188_133604",
104
+ # "M4": "61_70970",
105
+ female_spks = {"Female1": "2961_961", "Female2": "8463_287645", }
106
+ # "F3": "121_121726"
107
+ spks = dict(male_spks, **female_spks)
108
+ spk_names = sorted(spks.keys())
109
+
110
+
111
+ ## 20230224 Mousa: No reference,
112
+ def ASRTTS(audio_file, spk_name, ref_text=""):
113
+ spk = spks[spk_name]
114
+ spembs = xvectors[spk]
115
+ if ref_text == "":
116
+ reg_text = transcriber(audio_file)["text"]
117
+ else:
118
+ reg_text = ref_text
119
+
120
+ speech, sr = torchaudio.load(
121
+ audio_file, channels_first=True
122
+ ) # Mono channel
123
+ wav_tensor_spembs = text2speech(
124
+ text=reg_text, speech=speech, spembs=spembs
125
+ )["wav"]
126
+ wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
127
+ sample_rate = 22050
128
+ save_id = (
129
+ "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
130
+ )
131
+ torchaudio.save(
132
+ save_id,
133
+ src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
134
+ sample_rate=22050,
135
+ )
136
+
137
+ return save_id, reg_text
138
+
139
+
140
+ def ASRTTS_clean(audio_file, spk_name):
141
+ spk = spks[spk_name]
142
+ spembs = xvectors[spk]
143
+
144
+ reg_text = transcriber(audio_file)["text"]
145
+
146
+ speech, sr = torchaudio.load(
147
+ audio_file, channels_first=True
148
+ ) # Mono channel
149
+ wav_tensor_spembs = text2speech(
150
+ text=reg_text, speech=speech, spembs=spembs
151
+ )["wav"]
152
+ wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
153
+ sample_rate = 22050
154
+ save_id = (
155
+ "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
156
+ )
157
+ torchaudio.save(
158
+ save_id,
159
+ src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
160
+ sample_rate=22050,
161
+ )
162
+ return save_id
163
+
164
+
165
+ reference_textbox = gr.Textbox(
166
+ value="",
167
+ placeholder="Input reference here",
168
+ label="Reference",
169
+ )
170
+
171
+ recognization_textbox = gr.Textbox(
172
+ value="",
173
+ placeholder="Output recognization here",
174
+ label="recognization_textbox",
175
+ )
176
+
177
+ speaker_option = gr.Radio(choices=spk_names, label="Speaker")
178
+
179
+ input_audio = gr.Audio(
180
+ source="upload", type="filepath", label="Audio_to_Evaluate"
181
+ )
182
+ output_audio = gr.Audio(
183
+ source="upload", file="filepath", label="Synthesized Audio"
184
+ )
185
+ examples = [
186
+ ["./samples/001.wav", "M1", ""],
187
+ ["./samples/002.wav", "M2", ""],
188
+ ["./samples/003.wav", "F1", ""],
189
+ ["./samples/004.wav", "F2", ""],
190
+ ]
191
+
192
+
193
+ def change_audiobox(choice):
194
+ if choice == "upload":
195
+ input_audio = gr.Audio.update(source="upload", visible=True)
196
+ elif choice == "microphone":
197
+ input_audio = gr.Audio.update(source="microphone", visible=True)
198
+ else:
199
+ input_audio = gr.Audio.update(visible=False)
200
+ return input_audio
201
+
202
+
203
+ def show_icon(choice):
204
+ if choice == "Male1":
205
+ spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
206
+ elif choice == "Male2":
207
+ spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
208
+ elif choice == "Female1":
209
+ spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
210
+ elif choice == "Female2":
211
+ spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
212
+ return spk_icon
213
+
214
+ def get_download_file(audio_file=None):
215
+ if audio_file == None:
216
+ output_audio_file = gr.File.update(visible=False)
217
+ else:
218
+ output_audio_file = gr.File.update(visible=True)
219
+ return output_audio_file
220
+
221
+ def download_file(audio_file):
222
+ return gr.File(value=audio_file)
223
+ # pdb.set_trace()
224
+
225
+ with gr.Blocks(
226
+ analytics_enabled=False,
227
+ css=".gradio-container {background-color: #78BD91}",
228
+ ) as demo:
229
+ with gr.Column(elem_id="Column"):
230
+ input_format = gr.Radio(
231
+ choices=["microphone", "upload"], label="Choose your input format", elem_id="input_format"
232
+ )
233
+ input_audio = gr.Audio(
234
+ source="microphone",
235
+ type="filepath",
236
+ label="Input Audio",
237
+ interactive=True,
238
+ visible=False,
239
+ elem_id="input_audio"
240
+ )
241
+ input_format.change(
242
+ fn=change_audiobox, inputs=input_format, outputs=input_audio
243
+ )
244
+
245
+ speaker_option = gr.Radio(choices=spk_names, value="Male1", label="Choose your voice profile")
246
+ spk_icon = gr.Image(value="speaker_icons/male1.png",
247
+ type="filepath",
248
+ image_mode="RGB",
249
+ source="upload",
250
+ shape=[50, 50],
251
+ interactive=True,
252
+ visible=True)
253
+ speaker_option.change(
254
+ fn=show_icon, inputs=speaker_option, outputs=spk_icon
255
+ )
256
+
257
+ b2 = gr.Button("Convert")
258
+
259
+ output_audio = gr.Audio(
260
+ source="upload", file="filepath", label="Converted Audio", interactive=False
261
+ )
262
+
263
+ b2.click(
264
+ ASRTTS_clean,
265
+ inputs=[input_audio, speaker_option],
266
+ outputs=output_audio,
267
+ api_name="convert"
268
+ )
269
+
270
+ # download_file("wav/001_F1_spkembs.wav")
271
+
272
+ demo.launch(share=False)
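One consistency note on this new script: spks is now keyed by "Male1"/"Male2"/"Female1"/"Female2", but the examples list still passes the old short keys ("M1", "F1", ...), so if those examples are ever wired to the interface they will not match. A usage sketch with the current keys (sample path taken from the examples list):

# Hypothetical invocation; "Male1" must be a key of `spks`, and ./wav/ must exist for torchaudio.save
converted_path = ASRTTS_clean("./samples/001.wav", "Male1")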
local/ASR_compare.py CHANGED
@@ -44,6 +44,26 @@ transcriber = pipeline(
44
  old_transcriber = pipeline(
45
  "automatic-speech-recognition", "facebook/wav2vec2-base-960h"
46
  )
47
  # transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
48
  # 【Female】kan-bayashi ljspeech parallel wavegan
49
  # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
@@ -81,7 +101,7 @@ from espnet_model_zoo.downloader import ModelDownloader
81
 
82
  d = ModelDownloader()
83
  model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
84
-
85
  # Speaker x-vector selection
86
 
87
  xvector_ark = [
@@ -92,6 +112,7 @@ xvector_ark = [
92
  if "tr" in p
93
  ][0]
94
  xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
 
95
  spks = list(xvectors.keys())
96
 
97
  male_spks = {
@@ -115,6 +136,25 @@ def ASRnew(audio_file):
115
  reg_text = transcriber(audio_file)["text"]
116
  return reg_text
117
 
118
 
119
  # def ref_reg_callback(audio_file, spk_name, ref_text):
120
  # reg_text = ref_text
@@ -190,25 +230,69 @@ with gr.Blocks(
190
 
191
  with gr.Row():
192
  b1 = gr.Button("Conventional Speech Recognition Engine")
193
- old_recognization_textbox = gr.Textbox(
194
  value="",
195
  placeholder="Recognition output",
196
  label="Convertional",
197
  )
198
  b1.click(
199
- ASRold, inputs=[input_audio], outputs=old_recognization_textbox
200
  )
201
 
202
  with gr.Row():
203
- b2 = gr.Button("Laronix Speech Recognition Engine")
204
- new_recognization_textbox = gr.Textbox(
205
  value="",
206
  placeholder="Recognition output",
207
  label="Purposed",
208
  )
209
 
210
  b2.click(
211
- ASRnew, inputs=[input_audio], outputs=new_recognization_textbox
212
  )
213
 
214
  demo.launch(share=True)
 
44
  old_transcriber = pipeline(
45
  "automatic-speech-recognition", "facebook/wav2vec2-base-960h"
46
  )
47
+ whisper_transcriber = pipeline(
48
+ "automatic-speech-recognition", "KevinGeng/whipser_medium_en_PAL300_step25"
49
+ )
50
+
51
+ whisper_transcriber_org = pipeline(
52
+ "automatic-speech-recognition", "KevinGeng/whisper-medium-PAL128-25step"
53
+ )
54
+
55
+ whisper_transcriber_Tony = pipeline(
56
+ "automatic-speech-recognition", "KevinGeng/Tony1_AVA_script_conv_train_conv_dev"
57
+ )
58
+
59
+ whisper_transcriber_John = pipeline(
60
+ "automatic-speech-recognition", "KevinGeng/whipser_medium_en_PAL300_step25_step2_VTCK"
61
+ )
62
+
63
+ whisper_transcriber_Negel = pipeline(
64
+ "automatic-speech-recognition", "KevinGeng/Negel_152_AVA_script_conv_train_conv_dev"
65
+ )
66
+
67
  # transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
68
  # 【Female】kan-bayashi ljspeech parallel wavegan
69
  # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
 
101
 
102
  d = ModelDownloader()
103
  model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
104
+ pdb.set_trace()
105
  # Speaker x-vector selection
106
 
107
  xvector_ark = [
 
112
  if "tr" in p
113
  ][0]
114
  xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
115
+
116
  spks = list(xvectors.keys())
117
 
118
  male_spks = {
 
136
  reg_text = transcriber(audio_file)["text"]
137
  return reg_text
138
 
139
+ def ASRwhipser_FT(audio_file):
140
+ reg_text = whisper_transcriber(audio_file)["text"]
141
+ return reg_text
142
+
143
+ def ASRwhipser_Org(audio_file):
144
+ reg_text = whisper_transcriber_org(audio_file)["text"]
145
+ return reg_text
146
+
147
+ def ASRwhipser_Tony(audio_file):
148
+ reg_text = whisper_transcriber_Tony(audio_file)["text"]
149
+ return reg_text
150
+
151
+ def ASRwhipser_Negel(audio_file):
152
+ reg_text = whisper_transcriber_Negel(audio_file)["text"]
153
+ return reg_text
154
+
155
+ def ASRwhipser_John(audio_file):
156
+ reg_text = whisper_transcriber_John(audio_file)["text"]
157
+ return reg_text
158
 
159
  # def ref_reg_callback(audio_file, spk_name, ref_text):
160
  # reg_text = ref_text
 
230
 
231
  with gr.Row():
232
  b1 = gr.Button("Conventional Speech Recognition Engine")
233
+ t1 = gr.Textbox(
234
  value="",
235
  placeholder="Recognition output",
236
  label="Convertional",
237
  )
238
  b1.click(
239
+ ASRold, inputs=[input_audio], outputs=t1
240
  )
241
 
242
  with gr.Row():
243
+ b2 = gr.Button("Laronix Speech Recognition Engine (Ver1, wav2vec2.0+CTC)")
244
+ t2 = gr.Textbox(
245
  value="",
246
  placeholder="Recognition output",
247
  label="Purposed",
248
  )
249
 
250
  b2.click(
251
+ ASRnew, inputs=[input_audio], outputs=t2
252
+ )
253
+ with gr.Row():
254
+ b3 = gr.Button("Laronix Speech Recognition Engine (Ver2, Whipser)")
255
+ t3 = gr.Textbox(
256
+ value="",
257
+ placeholder="Recognition output",
258
+ label="Purposed",
259
+ )
260
+
261
+ b3.click(
262
+ ASRwhipser_FT, inputs=[input_audio], outputs=t3
263
+ )
264
+ with gr.Row():
265
+ b4 = gr.Button("Laronix Speech Recognition Engine (Whipser, FT with Tony)")
266
+ t4 = gr.Textbox(
267
+ value="",
268
+ placeholder="Recognition output",
269
+ label="Purposed",
270
+ )
271
+
272
+ b4.click(
273
+ ASRwhipser_Tony, inputs=[input_audio], outputs=t4
274
+ )
275
+ with gr.Row():
276
+ b5 = gr.Button("Laronix Speech Recognition Engine (Whipser, FT with John)")
277
+ t5 = gr.Textbox(
278
+ value="",
279
+ placeholder="Recognition output",
280
+ label="Purposed",
281
+ )
282
+
283
+ b5.click(
284
+ ASRwhipser_John, inputs=[input_audio], outputs=t5
285
+ )
286
+ with gr.Row():
287
+ b6 = gr.Button("Laronix Speech Recognition Engine (Whipser, FT with Negel)")
288
+ t6 = gr.Textbox(
289
+ value="",
290
+ placeholder="Recognition output",
291
+ label="Purposed",
292
+ )
293
+
294
+ b6.click(
295
+ ASRwhipser_Negel, inputs=[input_audio], outputs=t6
296
  )
297
 
298
  demo.launch(share=True)
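The new comparison rows all follow the same shape (one pipeline, one wrapper function, one Button/Textbox pair). If the list of engines keeps growing, the same UI can be generated from a dict of model IDs; a compact sketch under that assumption, reusing the checkpoint names from this diff (like the current script, it still loads every checkpoint up front):

import gradio as gr
from transformers import pipeline

ENGINES = {
    "Conventional (wav2vec2-base-960h)": "facebook/wav2vec2-base-960h",
    "Laronix Ver1 (wav2vec2.0 + CTC)": "KevinGeng/PAL_John_128_train_dev_test_seed_1",
    "Laronix Ver2 (Whisper)": "KevinGeng/whipser_medium_en_PAL300_step25",
    "Whisper, FT with Tony": "KevinGeng/Tony1_AVA_script_conv_train_conv_dev",
    "Whisper, FT with John": "KevinGeng/whipser_medium_en_PAL300_step25_step2_VTCK",
    "Whisper, FT with Negel": "KevinGeng/Negel_152_AVA_script_conv_train_conv_dev",
}
transcribers = {name: pipeline("automatic-speech-recognition", model=m) for name, m in ENGINES.items()}

def make_recognize(name):
    # Bind the engine name now to avoid the late-binding closure pitfall inside the loop
    def recognize(audio_file):
        return transcribers[name](audio_file)["text"]
    return recognize

with gr.Blocks() as demo:
    input_audio = gr.Audio(source="microphone", type="filepath", label="Input Audio")
    for name in ENGINES:
        with gr.Row():
            button = gr.Button(name)
            textbox = gr.Textbox(placeholder="Recognition output", label=name)
            button.click(make_recognize(name), inputs=[input_audio], outputs=textbox)

demo.launch()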
local/ASR_conpare.py ADDED
@@ -0,0 +1,72 @@
1
+ #TODO:
2
+ # + [x] Load Configuration
3
+ # + [ ] Checking
4
+ # + [ ] Better saving directory
5
+
6
+ from pathlib import Path
7
+ from transformers import pipeline
8
+ import torch.nn as nn
9
+ import torch
10
+ import torchaudio
11
+ import gradio as gr
12
+ import sys
13
+
14
+ # Local imports
15
+ sys.path.append("src")
16
+ from espnet2.bin.tts_inference import Text2Speech
17
+ from espnet2.utils.types import str_or_none
18
+
19
+ # Check if GPU is available
20
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
21
+
22
+ # ASR part
23
+
24
+ data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
25
+ audio_files = sorted(list(Path(data_path).glob("**/*wav")))
26
+ # audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
27
+
28
+ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
29
+
30
+ # TTS part
31
+ def load_model(lang, tag, vocoder_tag):
32
+ if lang == "Japanese":
33
+ if tag == "kan-bayashi/ljspeech_parallel_wavegan":
34
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_parallel_wavegan")
35
+ elif tag == "kan-bayashi/ljspeech_merlin_multi_band_melgan":
36
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_merlin_multi_band_melgan")
37
+ else:
38
+ raise ValueError(f"Not supported: lang={lang}, tag={tag}")
39
+ vocoder = None if vocoder_tag == "none" else vocoder_tag
40
+ elif lang == "English":
41
+ # VITS needs no vocoder; others do
42
+ if tag == "kan-bayashi/libritts_xvector_vits":
43
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/libritts_xvector_vits")
44
+ vocoder = None
45
+ elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
46
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3")
47
+ vocoder = "melgan"
48
+ else:
49
+ raise ValueError(f"Not supported: lang={lang}, tag={tag}")
50
+ else:
51
+ raise ValueError(f"Not supported: lang={lang}")
52
+ return tts_model, vocoder
53
+
54
+ tts_model, vocoder_tag = load_model(lang="English", tag="kan-bayashi/libritts_xvector_vits", vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long")
55
+ tts_model = tts_model.to(device)
56
+
57
+ vocoder = None if vocoder_tag == "none" else torchaudio.models.vocoder.from_pretrained(vocoder_tag).to(device)
58
+
59
+ # Gradio part
60
+ def synthesize(text):
61
+ with torch.no_grad():
62
+ # Text-to-speech
63
+ wav = tts_model(text)[0]
64
+ if vocoder is not None:
65
+ # Apply vocoder
66
+ wav = vocoder.inference(wav)
67
+ # Convert to numpy array
68
+ wav = wav.squeeze().cpu().numpy()
69
+ return wav
70
+
71
+ interface = gr.Interface(synthesize, inputs="text", outputs="audio")
72
+ interface.launch()
local/PAL_dataset.py ADDED
@@ -0,0 +1,34 @@
1
+ ## ADD dataset appendning
2
+ from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
3
+ import pdb
4
+
5
+ import numpy as np
6
+ # to_dataset = load_dataset("KevinGeng/testdataset")
7
+ base_dataset = load_dataset("../laronix_automos/data/Patient_sil_trim_16k_normed_5_snr_40")
8
+ base_extra_dataset = load_dataset("../laronix_automos/data/John_p326_large")
9
+
10
+ PAL_dataset = DatasetDict({"base": base_dataset['train'], "base_extra": base_extra_dataset['train']})
11
+ # PAL_dataset.push_to_hub("KevinGeng/PAL_dataset")
12
+ concatenate_datasets(base_dataset['train'], base_extra_dataset['train'])
13
+ pdb.set_trace()
14
+
15
+ new_record = {"audio":
16
+ {'path': 'Arthur_set1_001_noisy.wav',
17
+ 'array': np.array([0.02526855, 0.04602051, 0.04873657, 0.00045776, 0.00201416, 0.00167847]),
18
+ 'sampling_rate': 16000},
19
+ "transcription": "TOD"}
20
+ pdb.set_trace()
21
+
22
+ import requests
23
+ headers = {"Authorization": f"KevinGeng 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"}
24
+ # headers = {"Authorization": "Haopeng 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"}
25
+ # pdb.set_trace()
26
+ API_URL = "https://datasets-server.huggingface.co/is-valid?dataset=KevinGeng/testdataset"
27
+
28
+ def query():
29
+ response = requests.request("GET", API_URL, headers=headers)
30
+ # pdb.set_trace()
31
+ return response.json()
32
+ data = query()
33
+
34
+ pdb.set_trace()
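One note on this new helper: datasets.concatenate_datasets takes a single list of datasets rather than two positional arguments, and its return value is discarded on line 12. A sketch of the merge as presumably intended (the "merged" split name is an assumption, not in the original):

from datasets import load_dataset, DatasetDict, concatenate_datasets

base_dataset = load_dataset("../laronix_automos/data/Patient_sil_trim_16k_normed_5_snr_40")
base_extra_dataset = load_dataset("../laronix_automos/data/John_p326_large")

# concatenate_datasets expects a list; keep the merged split alongside the originals
merged_train = concatenate_datasets([base_dataset["train"], base_extra_dataset["train"]])
PAL_dataset = DatasetDict({
    "base": base_dataset["train"],
    "base_extra": base_extra_dataset["train"],
    "merged": merged_train,  # assumed split name
})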
local/app.genie.py ADDED
@@ -0,0 +1,74 @@
1
+ #TODO:
2
+ # + [x] Load Configuration
3
+ # + [ ] Checking
4
+ # + [ ] Better saving directory
5
+
6
+ from pathlib import Path
7
+ from transformers import pipeline
8
+ import torch.nn as nn
9
+ import torch
10
+ import torchaudio
11
+ import gradio as gr
12
+ import sys
13
+
14
+ # Local imports
15
+ sys.path.append("src")
16
+ from espnet2.bin.tts_inference import Text2Speech
17
+ from espnet2.utils.types import str_or_none
18
+
19
+ # Check if GPU is available
20
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
21
+
22
+ # ASR part
23
+
24
+ data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
25
+ audio_files = sorted(list(Path(data_path).glob("**/*wav")))
26
+ # audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
27
+
28
+ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
29
+
30
+ # TTS part
31
+ def load_model(lang, tag, vocoder_tag):
32
+ if lang == "Japanese":
33
+ if tag == "kan-bayashi/ljspeech_parallel_wavegan":
34
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_parallel_wavegan")
35
+ elif tag == "kan-bayashi/ljspeech_merlin_multi_band_melgan":
36
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_merlin_multi_band_melgan")
37
+ else:
38
+ raise ValueError(f"Not supported: lang={lang}, tag={tag}")
39
+ vocoder = None if vocoder_tag == "none" else vocoder_tag
40
+ elif lang == "English":
41
+ # VITS needs no vocoder; others do
42
+ if tag == "kan-bayashi/libritts_xvector_vits":
43
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/libritts_xvector_vits")
44
+ vocoder = None
45
+ elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
46
+ tts_model = Text2Speech.from_pretrained("kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3")
47
+ vocoder = "melgan"
48
+ else:
49
+ raise ValueError(f"Not supported: lang={lang}, tag={tag}")
50
+ else:
51
+ raise ValueError(f"Not supported: lang={lang}")
52
+ return tts_model, vocoder
53
+
54
+ tts_model, vocoder_tag = load_model(lang="English", tag="kan-bayashi/libritts_xvector_vits", vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long")
55
+ import pdb
56
+ pdb.set_trace()
57
+ tts_model = tts_model.to(device)
58
+
59
+ vocoder = None if vocoder_tag == "none" else torchaudio.models.vocoder.from_pretrained(vocoder_tag).to(device)
60
+
61
+ # Gradio part
62
+ def synthesize(text):
63
+ with torch.no_grad():
64
+ # Text-to-speech
65
+ wav = tts_model(text)[0]
66
+ if vocoder is not None:
67
+ # Apply vocoder
68
+ wav = vocoder.inference(wav)
69
+ # Convert to numpy array
70
+ wav = wav.squeeze().cpu().numpy()
71
+ return wav
72
+
73
+ interface = gr.Interface(synthesize, inputs="text", outputs="audio")
74
+ interface.launch()
local/app.old.py ADDED
@@ -0,0 +1,149 @@
1
+ """
2
+ TODO:
3
+ + [x] Load Configuration
4
+ + [ ] Checking
5
+ + [ ] Better saving directory
6
+ """
7
+ import numpy as np
8
+ from pathlib import Path
9
+ import jiwer
10
+ import pdb
11
+ import torch.nn as nn
12
+ import torch
13
+ import torchaudio
14
+ from transformers import pipeline
15
+ from time import process_time, time
16
+ from pathlib import Path
17
+ # local import
18
+ import sys
19
+ from espnet2.bin.tts_inference import Text2Speech
20
+ # pdb.set_trace()
21
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
22
+
23
+ sys.path.append("src")
24
+
25
+ import gradio as gr
26
+
27
+ # ASR part
28
+
29
+ audio_files = [str(x) for x in sorted(Path("/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video").glob("**/*wav"))]
30
+ # audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
31
+ # transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
32
+ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
33
+ # 【Female】kan-bayashi ljspeech parallel wavegan
34
+ # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
35
+ # 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
36
+ # pdb.set_trace()
37
+ from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
38
+ from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
39
+
40
+ #@title English multi-speaker pretrained model { run: "auto" }
41
+ lang = 'English'
42
+ tag = 'kan-bayashi/libritts_xvector_vits'
43
+ # tag = "kan-bayashi/vctk_multi_spk_vits"
44
+ # vits needs no
45
+ vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" #@param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
46
+ from espnet2.bin.tts_inference import Text2Speech
47
+ from espnet2.utils.types import str_or_none
48
+
49
+ text2speech = Text2Speech.from_pretrained(
50
+ model_tag=str_or_none(tag),
51
+ vocoder_tag=str_or_none(vocoder_tag),
52
+ device="cuda",
53
+ use_att_constraint=False,
54
+ backward_window=1,
55
+ forward_window=3,
56
+ speed_control_alpha=1.0,
57
+ )
58
+
59
+ import glob
60
+ import os
61
+ import numpy as np
62
+ import kaldiio
63
+
64
+ # Get model directory path
65
+ from espnet_model_zoo.downloader import ModelDownloader
66
+ d = ModelDownloader()
67
+ model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
68
+
69
+ # Speaker x-vector selection
70
+
71
+ xvector_ark = [p for p in glob.glob(f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True) if "tr" in p][0]
72
+ xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
73
+ import pdb
74
+
75
+ pdb.set_trace()
76
+
77
+ spks = list(xvectors.keys())
78
+
79
+ male_spks = {"M1": "2300_131720", "M2": "1320_122612", "M3": "1188_133604", "M4": "61_70970"}
80
+ female_spks = {"F1": "2961_961", "F2": "8463_287645", "F3": "121_121726"}
81
+ spks = dict(male_spks, **female_spks)
82
+ spk_names = sorted(spks.keys())
83
+
84
+ def ASRTTS(audio_file, spk_name, ref_text=""):
85
+ spk = spks[spk_name]
86
+ spembs = xvectors[spk]
87
+ if ref_text == "":
88
+ reg_text = transcriber(audio_file)['text']
89
+ else:
90
+ reg_text = ref_text
91
+
92
+ speech, sr = torchaudio.load(audio_file, channels_first=True) # Mono channel
93
+ wav_tensor_spembs = text2speech(text=reg_text, speech=speech, spembs=spembs)["wav"]
94
+ wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
95
+ sample_rate = 22050
96
+ save_id = "./wav/" + Path(audio_file).stem + "_" + spk_name +"_spkembs.wav"
97
+ torchaudio.save(save_id, src=wav_tensor_spembs.unsqueeze(0).to("cpu"), sample_rate=22050)
98
+
99
+ return save_id, reg_text
100
+
101
+ def ref_reg_callback(audio_file, spk_name, ref_text):
102
+ reg_text = ref_text
103
+ return audio_file, spk_name, reg_text
104
+
105
+ reference_textbox = gr.Textbox(
106
+ value="",
107
+ placeholder="Input reference here",
108
+ label="Reference",
109
+ )
110
+
111
+ recognization_textbox = gr.Textbox(
112
+ value="",
113
+ placeholder="Output recognization here",
114
+ label="recognization_textbox",
115
+ )
116
+
117
+ speaker_option = gr.Radio(choices=spk_names, label="Speaker")
118
+
119
+ input_audio = gr.Audio(
120
+ source="microphone",
121
+ type="filepath",
122
+ label="Audio_to_Evaluate"
123
+ )
124
+ output_audio = gr.Audio(
125
+ source="upload",
126
+ file="filepath",
127
+ label="Synthesized Audio"
128
+ )
129
+ examples = [["./samples/001.wav",'M1', ""],
130
+ ["./samples/002.wav",'M2', ""],
131
+ ["./samples/003.wav",'F1', ""],
132
+ ["./samples/004.wav",'F2', ""]]
133
+
134
+ # ASRTTS(*examples[0])
135
+ iface = gr.Interface(
136
+ fn = ASRTTS,
137
+ inputs = [
138
+ input_audio,
139
+ speaker_option,
140
+ reference_textbox,
141
+ ],
142
+ outputs = [
143
+ output_audio,
144
+ recognization_textbox
145
+ ],
146
+ examples = examples
147
+ )
148
+ iface.input_callback = ref_reg_callback
149
+ iface.launch(share=False)
local/app.old.whipser.fined_tuned.py ADDED
@@ -0,0 +1,146 @@
1
+ """
2
+ TODO:
3
+ + [x] Load Configuration
4
+ + [ ] Checking
5
+ + [ ] Better saving directory
6
+ """
7
+ import numpy as np
8
+ from pathlib import Path
9
+ import jiwer
10
+ import pdb
11
+ import torch.nn as nn
12
+ import torch
13
+ import torchaudio
14
+ from transformers import pipeline
15
+ from time import process_time, time
16
+ from pathlib import Path
17
+ # local import
18
+ import sys
19
+ from espnet2.bin.tts_inference import Text2Speech
20
+ # pdb.set_trace()
21
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
22
+
23
+ sys.path.append("src")
24
+
25
+ import gradio as gr
26
+
27
+ # ASR part
28
+
29
+ audio_files = [str(x) for x in sorted(Path("/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video").glob("**/*wav"))]
30
+ # audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
31
+ # transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
32
+ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
33
+ # 【Female】kan-bayashi ljspeech parallel wavegan
34
+ # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
35
+ # 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
36
+ # pdb.set_trace()
37
+ from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
38
+ from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
39
+
40
+ #@title English multi-speaker pretrained model { run: "auto" }
41
+ lang = 'English'
42
+ tag = 'kan-bayashi/libritts_xvector_vits'
43
+ # tag = "kan-bayashi/vctk_multi_spk_vits"
44
+ # vits needs no
45
+ vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" #@param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
46
+ from espnet2.bin.tts_inference import Text2Speech
47
+ from espnet2.utils.types import str_or_none
48
+
49
+ text2speech = Text2Speech.from_pretrained(
50
+ model_tag=str_or_none(tag),
51
+ vocoder_tag=str_or_none(vocoder_tag),
52
+ device="cuda",
53
+ use_att_constraint=False,
54
+ backward_window=1,
55
+ forward_window=3,
56
+ speed_control_alpha=1.0,
57
+ )
58
+
59
+
60
+ import glob
61
+ import os
62
+ import numpy as np
63
+ import kaldiio
64
+
65
+ # Get model directory path
66
+ from espnet_model_zoo.downloader import ModelDownloader
67
+ d = ModelDownloader()
68
+ model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
69
+
70
+ # Speaker x-vector selection
71
+
72
+ xvector_ark = [p for p in glob.glob(f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True) if "tr" in p][0]
73
+ xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
74
+ spks = list(xvectors.keys())
75
+
76
+ male_spks = {"M1": "2300_131720", "M2": "1320_122612", "M3": "1188_133604", "M4": "61_70970"}
77
+ female_spks = {"F1": "2961_961", "F2": "8463_287645", "F3": "121_121726"}
78
+ spks = dict(male_spks, **female_spks)
79
+ spk_names = sorted(spks.keys())
80
+
81
+ def ASRTTS(audio_file, spk_name, ref_text=""):
82
+ spk = spks[spk_name]
83
+ spembs = xvectors[spk]
84
+ if ref_text == "":
85
+ reg_text = transcriber(audio_file)['text']
86
+ else:
87
+ reg_text = ref_text
88
+
89
+ speech, sr = torchaudio.load(audio_file, channels_first=True) # Mono channel
90
+ wav_tensor_spembs = text2speech(text=reg_text, speech=speech, spembs=spembs)["wav"]
91
+ wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
92
+ sample_rate = 22050
93
+ save_id = "./wav/" + Path(audio_file).stem + "_" + spk_name +"_spkembs.wav"
94
+ torchaudio.save(save_id, src=wav_tensor_spembs.unsqueeze(0).to("cpu"), sample_rate=22050)
95
+
96
+ return save_id, reg_text
97
+
98
+ def ref_reg_callback(audio_file, spk_name, ref_text):
99
+ reg_text = ref_text
100
+ return audio_file, spk_name, reg_text
101
+
102
+ reference_textbox = gr.Textbox(
103
+ value="",
104
+ placeholder="Input reference here",
105
+ label="Reference",
106
+ )
107
+
108
+ recognization_textbox = gr.Textbox(
109
+ value="",
110
+ placeholder="Output recognization here",
111
+ label="recognization_textbox",
112
+ )
113
+
114
+ speaker_option = gr.Radio(choices=spk_names, label="Speaker")
115
+
116
+ input_audio = gr.Audio(
117
+ source="microphone",
118
+ type="filepath",
119
+ label="Audio_to_Evaluate"
120
+ )
121
+ output_audio = gr.Audio(
122
+ source="upload",
123
+ file="filepath",
124
+ label="Synthesized Audio"
125
+ )
126
+ examples = [["./samples/001.wav",'M1', ""],
127
+ ["./samples/002.wav",'M2', ""],
128
+ ["./samples/003.wav",'F1', ""],
129
+ ["./samples/004.wav",'F2', ""]]
130
+
131
+ # ASRTTS(*examples[0])
132
+ iface = gr.Interface(
133
+ fn = ASRTTS,
134
+ inputs = [
135
+ input_audio,
136
+ speaker_option,
137
+ reference_textbox,
138
+ ],
139
+ outputs = [
140
+ output_audio,
141
+ recognization_textbox
142
+ ],
143
+ examples = examples
144
+ )
145
+ iface.input_callback = ref_reg_callback
146
+ iface.launch(share=False)
local/app.vctk.py ADDED
@@ -0,0 +1,146 @@
1
+ """
2
+ TODO:
3
+ + [x] Load Configuration
4
+ + [ ] Checking
5
+ + [ ] Better saving directory
6
+ """
7
+ import numpy as np
8
+ from pathlib import Path
9
+ import jiwer
10
+ import pdb
11
+ import torch.nn as nn
12
+ import torch
13
+ import torchaudio
14
+ from transformers import pipeline
15
+ from time import process_time, time
16
+ from pathlib import Path
17
+ # local import
18
+ import sys
19
+ from espnet2.bin.tts_inference import Text2Speech
20
+ # pdb.set_trace()
21
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
22
+
23
+ sys.path.append("src")
24
+
25
+ import gradio as gr
26
+
27
+ # ASR part
28
+
29
+ audio_files = [str(x) for x in sorted(Path("/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video").glob("**/*wav"))]
30
+ # audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
31
+ # transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
32
+ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
33
+ # 【Female】kan-bayashi ljspeech parallel wavegan
34
+ # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
35
+ # 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
36
+ # pdb.set_trace()
37
+ from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
38
+ from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
39
+
40
+ #@title English multi-speaker pretrained model { run: "auto" }
41
+ lang = 'English'
42
+ tag = 'kan-bayashi/libritts_xvector_vits'
43
+ # tag = "kan-bayashi/vctk_multi_spk_vits"
44
+ # vits needs no
45
+ vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" #@param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
46
+ from espnet2.bin.tts_inference import Text2Speech
47
+ from espnet2.utils.types import str_or_none
48
+
49
+ text2speech = Text2Speech.from_pretrained(
50
+ model_tag=str_or_none(tag),
51
+ vocoder_tag=str_or_none(vocoder_tag),
52
+ device="cuda",
53
+ use_att_constraint=False,
54
+ backward_window=1,
55
+ forward_window=3,
56
+ speed_control_alpha=1.0,
57
+ )
58
+
59
+
60
+ import glob
61
+ import os
62
+ import numpy as np
63
+ import kaldiio
64
+
65
+ # Get model directory path
66
+ from espnet_model_zoo.downloader import ModelDownloader
67
+ d = ModelDownloader()
68
+ model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
69
+
70
+ # Speaker x-vector selection
71
+
72
+ xvector_ark = [p for p in glob.glob(f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True) if "tr" in p][0]
73
+ xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
74
+ spks = list(xvectors.keys())
75
+
76
+ male_spks = {"M1": "2300_131720", "M2": "1320_122612", "M3": "1188_133604", "M4": "61_70970"}
77
+ female_spks = {"F1": "2961_961", "F2": "8463_287645", "F3": "121_121726"}
78
+ spks = dict(male_spks, **female_spks)
79
+ spk_names = sorted(spks.keys())
80
+
81
+ def ASRTTS(audio_file, spk_name, ref_text=""):
82
+ spk = spks[spk_name]
83
+ spembs = xvectors[spk]
84
+ if ref_text == "":
85
+ reg_text = transcriber(audio_file)['text']
86
+ else:
87
+ reg_text = ref_text
88
+
89
+ speech, sr = torchaudio.load(audio_file, channels_first=True) # Mono channel
90
+ wav_tensor_spembs = text2speech(text=reg_text, speech=speech, spembs=spembs)["wav"]
91
+ wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
92
+ sample_rate = 22050
93
+ save_id = "./wav/" + Path(audio_file).stem + "_" + spk_name +"_spkembs.wav"
94
+ torchaudio.save(save_id, src=wav_tensor_spembs.unsqueeze(0).to("cpu"), sample_rate=22050)
95
+
96
+ return save_id, reg_text
97
+
98
+ def ref_reg_callback(audio_file, spk_name, ref_text):
99
+ reg_text = ref_text
100
+ return audio_file, spk_name, reg_text
101
+
102
+ reference_textbox = gr.Textbox(
103
+ value="",
104
+ placeholder="Input reference here",
105
+ label="Reference",
106
+ )
107
+
108
+ recognization_textbox = gr.Textbox(
109
+ value="",
110
+ placeholder="Output recognization here",
111
+ label="recognization_textbox",
112
+ )
113
+
114
+ speaker_option = gr.Radio(choices=spk_names, label="Speaker")
115
+
116
+ input_audio = gr.Audio(
117
+ source="microphone",
118
+ type="filepath",
119
+ label="Audio_to_Evaluate"
120
+ )
121
+ output_audio = gr.Audio(
122
+ source="upload",
123
+ file="filepath",
124
+ label="Synthesized Audio"
125
+ )
126
+ examples = [["./samples/001.wav",'M1', ""],
127
+ ["./samples/002.wav",'M2', ""],
128
+ ["./samples/003.wav",'F1', ""],
129
+ ["./samples/004.wav",'F2', ""]]
130
+
131
+ # ASRTTS(*examples[0])
132
+ iface = gr.Interface(
133
+ fn = ASRTTS,
134
+ inputs = [
135
+ input_audio,
136
+ speaker_option,
137
+ reference_textbox,
138
+ ],
139
+ outputs = [
140
+ output_audio,
141
+ recognization_textbox
142
+ ],
143
+ examples = examples
144
+ )
145
+ iface.input_callback = ref_reg_callback
146
+ iface.launch(share=False)
local/app.whisper.py ADDED
@@ -0,0 +1,281 @@
1
+ """
2
+ TODO:
3
+ + [x] Load Configuration
4
+ + [ ] Checking
5
+ + [ ] Better saving directory
6
+ """
7
+ import numpy as np
8
+ from pathlib import Path
9
+ import torch.nn as nn
10
+ import torch
11
+ import torchaudio
12
+ from transformers import pipeline
13
+ from pathlib import Path
14
+
15
+ # local import
16
+ import sys
17
+ from espnet2.bin.tts_inference import Text2Speech
18
+ from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC# pdb.set_trace()
19
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
20
+
21
+ sys.path.append("src")
22
+
23
+ import gradio as gr
24
+
25
+ # ASR part
26
+
27
+ audio_files = [
28
+ str(x)
29
+ for x in sorted(
30
+ Path(
31
+ "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
32
+ ).glob("**/*wav")
33
+ )
34
+ ]
35
+ # audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
36
+ # transcriber = pipeline(
37
+ # "automatic-speech-recognition",
38
+ # model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
39
+ # )
40
+
41
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
42
+
43
+ processor = AutoProcessor.from_pretrained("openai/whisper-medium")
44
+
45
+ model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-medium")
46
+
47
+ # feature_extractor = AutoFeatureExtractor.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
48
+ # representation_model = AutoModelForCTC.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
49
+ # tokenizer = AutoTokenizer.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
50
+
51
+ import pdb
52
+ # pdb.set_trace()
53
+ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
54
+ # 【Female】kan-bayashi ljspeech parallel wavegan
55
+ # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
56
+ # 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
57
+ # pdb.set_trace()
58
+
59
+ # @title English multi-speaker pretrained model { run: "auto" }
60
+ lang = "English"
61
+ tag = "kan-bayashi/libritts_xvector_vits"
62
+ # vits needs no
63
+ vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
64
+ from espnet2.bin.tts_inference import Text2Speech
65
+ from espnet2.utils.types import str_or_none
66
+
67
+ text2speech = Text2Speech.from_pretrained(
68
+ model_tag=str_or_none(tag),
69
+ vocoder_tag=str_or_none(vocoder_tag),
70
+ device="cuda",
71
+ use_att_constraint=False,
72
+ backward_window=1,
73
+ forward_window=3,
74
+ speed_control_alpha=1.0,
75
+ )
76
+
77
+ import glob
78
+ import os
79
+ import numpy as np
80
+ import kaldiio
81
+
82
+ # Get model directory path
83
+ from espnet_model_zoo.downloader import ModelDownloader
84
+
85
+ d = ModelDownloader()
86
+ model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
87
+
88
+ # Speaker x-vector selection
89
+
90
+ xvector_ark = [
91
+ p
92
+ for p in glob.glob(
93
+ f"xvector/test-clean/spk_xvector.ark", recursive=True
94
+ )
95
+ if "test" in p
96
+ ][0]
97
+ xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
98
+ spks = list(xvectors.keys())
99
+
100
+ # pdb.set_trace()
101
+ # All old 20230101
102
+ # male_spks = {"Male1": "2300_131720", "Male2": "1320_122612", "Male3": "1188_133604",}
103
+ # "M4": "61_70970",
104
+ # female_spks = {"Female1": "2961_961", "Female2": "8463_287645", "Female3": "121_121726"}
105
+
106
+ # 6 scale from high to low,
107
+ male_spks = {"Male1": "4077_13751", "Male2": "1320_122612", "Male3": "7729_102255",}
108
+ female_spks = {"Female1": "5683_32865", "Female2": "121_121726", "Female3": "8463_287645"}
109
+ spks = dict(male_spks, **female_spks)
110
+ spk_names = sorted(spks.keys())
111
+
112
+
113
+ ## 20230224 Mousa: No reference,
114
+ def ASRTTS(audio_file, spk_name, ref_text=""):
115
+ spk = spks[spk_name]
116
+ spembs = xvectors[spk]
117
+ if ref_text == "":
118
+ reg_text = transcriber(audio_file)["text"]
119
+ else:
120
+ reg_text = ref_text
121
+
122
+ speech, sr = torchaudio.load(
123
+ audio_file, channels_first=True
124
+ ) # Mono channel
125
+ wav_tensor_spembs = text2speech(
126
+ text=reg_text, speech=speech, spembs=spembs
127
+ )["wav"]
128
+ wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
129
+ sample_rate = 22050
130
+ save_id = (
131
+ "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
132
+ )
133
+ torchaudio.save(
134
+ save_id,
135
+ src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
136
+ sample_rate=22050,
137
+ )
138
+
139
+ return save_id, reg_text
140
+
141
+
142
+ def ASRTTS_clean(audio_file, spk_name):
143
+ spk = spks[spk_name]
144
+ spembs = xvectors[spk]
145
+
146
+ reg_text = transcriber(audio_file)["text"]
147
+
148
+ speech, sr = torchaudio.load(
149
+ audio_file, channels_first=True
150
+ ) # Mono channel
151
+ wav_tensor_spembs = text2speech(
152
+ text=reg_text, speech=speech, spembs=spembs
153
+ )["wav"]
154
+ wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
155
+ sample_rate = 22050
156
+ save_id = (
157
+ "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
158
+ )
159
+ torchaudio.save(
160
+ save_id,
161
+ src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
162
+ sample_rate=22050,
163
+ )
164
+ return save_id
165
+
166
+
167
+ reference_textbox = gr.Textbox(
168
+ value="",
169
+ placeholder="Input reference here",
170
+ label="Reference",
171
+ )
172
+
173
+ recognization_textbox = gr.Textbox(
174
+ value="",
175
+ placeholder="Output recognization here",
176
+ label="recognization_textbox",
177
+ )
178
+
179
+ speaker_option = gr.Radio(choices=spk_names, label="Speaker")
180
+
181
+ input_audio = gr.Audio(
182
+ source="upload", type="filepath", label="Audio_to_Evaluate"
183
+ )
184
+ output_audio = gr.Audio(
185
+ source="upload", file="filepath", label="Synthesized Audio"
186
+ )
187
+ examples = [
188
+ ["./samples/001.wav", "M1", ""],
189
+ ["./samples/002.wav", "M2", ""],
190
+ ["./samples/003.wav", "F1", ""],
191
+ ["./samples/004.wav", "F2", ""],
192
+ ]
193
+
194
+
195
+ def change_audiobox(choice):
196
+ if choice == "upload":
197
+ input_audio = gr.Audio.update(source="upload", visible=True)
198
+ elif choice == "microphone":
199
+ input_audio = gr.Audio.update(source="microphone", visible=True)
200
+ else:
201
+ input_audio = gr.Audio.update(visible=False)
202
+ return input_audio
203
+
204
+
205
+ def show_icon(choice):
206
+ if choice == "Male1":
207
+ spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
208
+ elif choice == "Male2":
209
+ spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
210
+ elif choice == "Male3":
211
+ spk_icon = gr.Image.update(value="speaker_icons/male3.png", visible=True)
212
+ elif choice == "Female1":
213
+ spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
214
+ elif choice == "Female2":
215
+ spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
216
+ elif choice == "Female3":
217
+ spk_icon = gr.Image.update(value="speaker_icons/female3.png", visible=True)
218
+ return spk_icon
219
+
220
+ def get_download_file(audio_file=None):
221
+ if audio_file == None:
222
+ output_audio_file = gr.File.update(visible=False)
223
+ else:
224
+ output_audio_file = gr.File.update(visible=True)
225
+ return output_audio_file
226
+
227
+ def download_file(audio_file):
228
+ return gr.File(value=audio_file)
229
+ # pdb.set_trace()
230
+
231
+ # if __name__ == "__main__":
232
+ # file_share_app.run(port=3000)
233
+
234
+ with gr.Blocks(
235
+ analytics_enabled=False,
236
+ css=".gradio-container {background-color: #78BD91}",
237
+ ) as demo:
238
+ with gr.Column(elem_id="Column"):
239
+ input_format = gr.Radio(
240
+ choices=["microphone", "upload"], label="Choose your input format", elem_id="input_format"
241
+ )
242
+ input_audio = gr.Audio(
243
+ source="microphone",
244
+ type="filepath",
245
+ label="Input Audio",
246
+ interactive=True,
247
+ visible=False,
248
+ elem_id="input_audio"
249
+ )
250
+ input_format.change(
251
+ fn=change_audiobox, inputs=input_format, outputs=input_audio
252
+ )
253
+
254
+ speaker_option = gr.Radio(choices=spk_names, value="Male1", label="Choose your voice profile")
255
+ spk_icon = gr.Image(value="speaker_icons/male1.png",
256
+ type="filepath",
257
+ image_mode="RGB",
258
+ source="upload",
259
+ shape=[50, 50],
260
+ interactive=True,
261
+ visible=True)
262
+ speaker_option.change(
263
+ fn=show_icon, inputs=speaker_option, outputs=spk_icon
264
+ )
265
+
266
+ b2 = gr.Button("Convert")
267
+
268
+ output_audio = gr.Audio(
269
+ source="upload", file="filepath", label="Converted Audio", interactive=False
270
+ )
271
+
272
+ b2.click(
273
+ ASRTTS_clean,
274
+ inputs=[input_audio, speaker_option],
275
+ outputs=output_audio,
276
+ api_name="convert"
277
+ )
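+ # api_name="convert" registers this click handler as a named endpoint in the app's API,
+ # so the conversion can be triggered programmatically as well as from the UI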
278
+
279
+ # download_file("wav/001_F1_spkembs.wav")
280
+
281
+ demo.launch(share=False)
local/semi_streaming_ASR_TTS.py ADDED
@@ -0,0 +1,175 @@
1
+ """
2
+ TODO:
3
+ + [x] Load Configuration
4
+ + [ ] Checking
5
+ + [ ] Better saving directory
6
+ """
7
+ import numpy as np
8
+ from pathlib import Path
9
+ import jiwer
10
+ import pdb
11
+ import torch.nn as nn
12
+ import torch
13
+ import torchaudio
14
+ from transformers import pipeline
15
+ # from time import process_time, time
16
+ from pathlib import Path
17
+ import time
18
+ # local import
19
+ import sys
20
+ from espnet2.bin.tts_inference import Text2Speech
21
+
22
+ # pdb.set_trace()
23
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
24
+
25
+ sys.path.append("src")
26
+
27
+ import gradio as gr
28
+
29
+ # ASR part
30
+
31
+ audio_files = [
32
+ str(x)
33
+ for x in sorted(
34
+ Path(
35
+ "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
36
+ ).glob("**/*wav")
37
+ )
38
+ ]
39
+ # audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
40
+ transcriber = pipeline(
41
+ "automatic-speech-recognition",
42
+ model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
43
+ )
44
+ old_transcriber = pipeline(
45
+ "automatic-speech-recognition", "facebook/wav2vec2-base-960h"
46
+ )
47
+ # transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
48
+ # 【Female】kan-bayashi ljspeech parallel wavegan
49
+ # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
50
+ # 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
51
+ # pdb.set_trace()
52
+ from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
53
+ from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
54
+
55
+ # @title English multi-speaker pretrained model { run: "auto" }
56
+ lang = "English"
57
+ tag = "kan-bayashi/libritts_xvector_vits"
58
+ # vits needs no separate vocoder
59
+ vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
60
+ from espnet2.bin.tts_inference import Text2Speech
61
+ from espnet2.utils.types import str_or_none
62
+
63
+ text2speech = Text2Speech.from_pretrained(
64
+ model_tag=str_or_none(tag),
65
+ vocoder_tag=str_or_none(vocoder_tag),
66
+ device="cuda",
67
+ use_att_constraint=False,
68
+ backward_window=1,
69
+ forward_window=3,
70
+ speed_control_alpha=1.0,
71
+ )
72
+
73
+
74
+ import glob
75
+ import os
76
+ import numpy as np
77
+ import kaldiio
78
+
79
+ # Get model directory path
80
+ from espnet_model_zoo.downloader import ModelDownloader
81
+
82
+ d = ModelDownloader()
83
+ model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
84
+
85
+ # Speaker x-vector selection
86
+
87
+ xvector_ark = [
88
+ p
89
+ for p in glob.glob(
90
+ f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True
91
+ )
92
+ if "tr" in p
93
+ ][0]
94
+ xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
95
+ spks = list(xvectors.keys())
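+ # the ark keys look like LibriTTS "<speaker>_<chapter>" IDs; the dicts below map the
+ # UI speaker names (M1-M4, F1-F3) to one fixed x-vector each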
96
+
97
+ male_spks = {
98
+ "M1": "2300_131720",
99
+ "M2": "1320_122612",
100
+ "M3": "1188_133604",
101
+ "M4": "61_70970",
102
+ }
103
+ female_spks = {"F1": "2961_961", "F2": "8463_287645", "F3": "121_121726"}
104
+ spks = dict(male_spks, **female_spks)
105
+ spk_names = sorted(spks.keys())
106
+
107
+ ## 20230224 Mousa: No reference,
108
+ def ASRold(audio_file):
109
+ reg_text = old_transcriber(audio_file)["text"]
110
+ return reg_text
111
+
112
+
113
+ def ASRnew(audio_file, state=""):
114
+ # pdb.set_trace()
115
+ time.sleep(2)
116
+ reg_text = transcriber(audio_file)["text"]
117
+ state += reg_text + "\n"
118
+ return state, state
119
+
120
+ def VAD(audio_file):
121
+ # pdb.set_trace()
122
+ reg_text = transcriber(audio_file)["text"]
123
+ return 1
124
+
125
+
126
+ reference_textbox = gr.Textbox(
127
+ value="",
128
+ placeholder="Input reference here",
129
+ label="Reference",
130
+ )
131
+
132
+ recognization_textbox = gr.Textbox(
133
+ value="",
134
+ placeholder="Output recognization here",
135
+ label="recognization_textbox",
136
+ )
137
+
138
+ speaker_option = gr.Radio(choices=spk_names, label="Speaker")
139
+
140
+ input_audio = gr.Audio(
141
+ source="upload", type="filepath", label="Audio_to_Evaluate"
142
+ )
143
+ output_audio = gr.Audio(
144
+ source="upload", file="filepath", label="Synthesized Audio"
145
+ )
146
+ examples = [
147
+ ["./samples/001.wav", "M1", ""],
148
+ ["./samples/002.wav", "M2", ""],
149
+ ["./samples/003.wav", "F1", ""],
150
+ ["./samples/004.wav", "F2", ""],
151
+ ]
152
+
153
+ def change_audiobox(choice):
154
+ if choice == "upload":
155
+ input_audio = gr.Audio.update(source="upload", visible=True)
156
+ elif choice == "microphone":
157
+ input_audio = gr.Audio.update(source="microphone", visible=True)
158
+ else:
159
+ input_audio = gr.Audio.update(visible=False)
160
+ return input_audio
161
+
162
+ demo = gr.Interface(
163
+ fn=ASRnew,
164
+ inputs=[
165
+ gr.Audio(source="microphone", type="filepath", streaming=True),
166
+ "state"
167
+ ],
168
+ outputs=[
169
+ "textbox",
170
+ "state"
171
+ ],
172
+ live=True)
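+ # with a streaming microphone input and live=True, Gradio calls ASRnew on successive
+ # audio chunks; the "state" input/output carries the accumulated transcript between calls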
173
+ # ASRnew(["/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/20221228_video_good_normed_5/take1_001_norm.wav", "state"])
174
+ # VAD("/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/20221228_video_good_normed_5/take1_001_norm.wav")
175
+ demo.launch(share=False)
local/streaming_VAD.py ADDED
@@ -0,0 +1,74 @@
1
+ import pyaudio
2
+ import numpy as np
3
+ import webrtcvad
4
+
5
+ # Set up PyAudio
6
+ FORMAT = pyaudio.paInt16
7
+ CHANNELS = 1
8
+ RATE = 48000
9
+ CHUNK_SIZE = 960 # 20ms audio chunks
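+ # 960 samples at 48 kHz is a 20 ms frame, one of the frame lengths webrtcvad accepts (10/20/30 ms)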
10
+ # p = pyaudio.PyAudio()
11
+
12
+ # wav = "/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/20221228_video_good_normed_5/take1_001_norm.wav"
13
+ wav = "/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/VAD_test.wav"
14
+ import wave
15
+ wf = wave.open(wav, "rb")
16
+ # import pdb
17
+ # stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
18
+ # channels=wf.getnchannels(),
19
+ # rate=wf.getframerate(),
20
+ # output=True)
21
+ # pdb.set_trace()
22
+ # Set up VAD
23
+
24
+ def streaming_VAD(wf):
25
+ vad = webrtcvad.Vad()
26
+ vad.set_mode(2) # Aggressive mode
27
+
28
+ # Start audio stream
29
+ # stream = p.open(format=FORMAT,
30
+ # channels=CHANNELS,
31
+ # rate=RATE,
32
+ # input=True,
33
+ # frames_per_buffer=CHUNK_SIZE)
34
+
35
+ # VAD constants
36
+ MIN_SILENCE_DURATION = 2000 # in ms
37
+ MAX_SILENCE_DURATION = 4000 # in ms
38
+ BUFFER_SIZE = MAX_SILENCE_DURATION // CHUNK_SIZE
39
+ BUFFER_THRESHOLD = int(BUFFER_SIZE * 0.5)
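+ # with these values BUFFER_SIZE = 4000 // 960 = 4 chunks (~80 ms of audio) and
+ # BUFFER_THRESHOLD = 2, i.e. a buffer counts as silence once half its chunks have no speech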
40
+
41
+ # Initialize VAD buffer
42
+ vad_buffer = []
43
+ VAD_indicator = []
44
+ VAD_frame_indicator = []
45
+ data = wf.readframes(CHUNK_SIZE)
46
+ # Loop through audio stream
47
+ while data:
48
+ # Read audio chunk from stream
49
+ # pdb.set_trace()
50
+ # audio_chunk = np.frombuffer(stream.read(CHUNK_SIZE), dtype=np.int16)
51
+ audio_chunk = np.frombuffer(data, dtype=np.int16)
52
+ # Detect voice activity
53
+ # is_speech = vad.is_speech(audio_chunk.tobytes(), RATE)
54
+ try:
55
+ is_speech = vad.is_speech(audio_chunk, RATE)
56
+ except Exception:  # vad.is_speech raises on malformed frames (e.g. a short final chunk); treat them as non-speech
57
+ is_speech = False
58
+ vad_buffer.append(is_speech)
59
+
60
+ # If VAD buffer is full, check for silence and reset buffer
61
+ if len(vad_buffer) == BUFFER_SIZE:
62
+ # Check if buffer contains mostly silence
63
+ if vad_buffer.count(False) >= BUFFER_THRESHOLD:
64
+ # print("Slience")
65
+ # VAD_indicator.append(0)
66
+ # vad_buffer = []
67
+ return(False)
68
+ else:
69
+ # print("Voice detected!")
70
+ # VAD_indicator.append(1)
71
+ vad_buffer = vad_buffer[CHUNK_SIZE // BUFFER_SIZE:]
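+ # note: CHUNK_SIZE // BUFFER_SIZE = 960 // 4 = 240, so this slice clears the whole
+ # 4-chunk buffer before the function returns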
72
+ return(True)
73
+ data = wf.readframes(CHUNK_SIZE)
74
+
requirements.txt CHANGED
@@ -28,7 +28,7 @@ fsspec==2022.2.0
28
  future==0.18.2
29
  google-auth==2.6.0
30
  google-auth-oauthlib==0.4.6
31
- gradio==3.2
32
  grpcio==1.44.0
33
  h11==0.12.0
34
  hydra-core==1.0.7
@@ -108,3 +108,8 @@ jiwer
108
  # charset
109
 
110
  gradio
 
 
 
 
 
 
28
  future==0.18.2
29
  google-auth==2.6.0
30
  google-auth-oauthlib==0.4.6
31
+ gradio==3.18
32
  grpcio==1.44.0
33
  h11==0.12.0
34
  hydra-core==1.0.7
 
108
  # charset
109
 
110
  gradio
111
+
112
+ flask
113
+
114
+ # datasets
115
+ datasets
requirements.txt.bak.bak DELETED
@@ -1,141 +0,0 @@
1
- aiofiles==23.1.0
2
- aiohttp==3.8.4
3
- aiosignal==1.3.1
4
- altair==4.2.2
5
- antlr4-python3-runtime==4.8
6
- anyio==3.6.2
7
- appdirs==1.4.4
8
- argcomplete==2.0.0
9
- async-timeout==4.0.2
10
- asynctest==0.13.0
11
- attrs==22.2.0
12
- audioread==3.0.0
13
- beautifulsoup4==4.11.2
14
- bitarray==2.7.2
15
- black==23.1.0
16
- brotlipy==0.7.0
17
- cchardet==2.1.7
18
- chardet==5.1.0
19
- charset-normalizer==3.0.1
20
- ci-sdr==0.0.2
21
- click==8.1.3
22
- colorama==0.4.6
23
- ConfigArgParse==1.5.3
24
- ctc-segmentation==1.7.4
25
- cycler==0.11.0
26
- Cython==0.29.33
27
- decorator==5.1.1
28
- Distance==0.1.3
29
- editdistance==0.6.2
30
- einops==0.6.0
31
- entrypoints==0.4
32
- espnet==202301
33
- espnet-model-zoo==0.1.7
34
- espnet-tts-frontend==0.0.3
35
- fairseq==0.12.2
36
- fast-bss-eval==0.1.3
37
- fastapi==0.91.0
38
- ffmpy==0.3.0
39
- filelock==3.9.0
40
- fonttools==4.38.0
41
- frozenlist==1.3.3
42
- fsspec==2023.1.0
43
- g2p-en==2.1.0
44
- gdown==4.6.3
45
- gradio==3.18.0
46
- h11==0.14.0
47
- h5py==3.8.0
48
- httpcore==0.16.3
49
- httpx==0.23.3
50
- huggingface-hub==0.12.0
51
- humanfriendly==10.0
52
- hydra-core==1.0.7
53
- importlib-metadata==4.13.0
54
- importlib-resources==5.10.2
55
- inflect==6.0.2
56
- jaconv==0.3.3
57
- jamo==0.4.1
58
- Jinja2==3.1.2
59
- jiwer==2.5.1
60
- joblib==1.2.0
61
- jsonschema==4.17.3
62
- kaldiio==2.17.2
63
- kiwisolver==1.4.4
64
- Levenshtein==0.20.2
65
- librosa==0.9.2
66
- linkify-it-py==1.0.3
67
- llvmlite==0.39.1
68
- lxml==4.9.2
69
- markdown-it-py==2.1.0
70
- MarkupSafe==2.1.2
71
- matplotlib==3.5.3
72
- mdit-py-plugins==0.3.3
73
- mdurl==0.1.2
74
- mkl-fft==1.3.1
75
- mkl-service==2.4.0
76
- multidict==6.0.4
77
- mypy-extensions==1.0.0
78
- nltk==3.8.1
79
- numba==0.56.4
80
- numpy==1.21.6
81
- omegaconf==2.0.6
82
- opt-einsum==3.3.0
83
- orjson==3.8.6
84
- packaging==23.0
85
- pandas==1.3.5
86
- parallel-wavegan==0.5.5
87
- pathspec==0.11.0
88
- Pillow==9.3.0
89
- pkgutil_resolve_name==1.3.10
90
- platformdirs==3.0.0
91
- pooch==1.6.0
92
- portalocker==2.7.0
93
- protobuf==3.20.1
94
- pycryptodome==3.17
95
- pydantic==1.10.4
96
- pydub==0.25.1
97
- pyparsing==3.0.9
98
- pypinyin==0.44.0
99
- pyrsistent==0.19.3
100
- python-dateutil==2.8.2
101
- python-multipart==0.0.5
102
- pytorch-wpe==0.0.1
103
- pytz==2022.7.1
104
- pyworld==0.3.2
105
- PyYAML==6.0
106
- rapidfuzz==2.13.7
107
- regex==2022.10.31
108
- requests==2.28.2
109
- resampy==0.4.2
110
- rfc3986==1.5.0
111
- sacrebleu==2.3.1
112
- scikit-learn==1.0.2
113
- scipy==1.7.3
114
- sentencepiece==0.1.97
115
- sniffio==1.3.0
116
- soundfile==0.11.0
117
- soupsieve==2.4
118
- starlette==0.24.0
119
- tabulate==0.9.0
120
- tensorboardX==2.6
121
- threadpoolctl==3.1.0
122
- tokenizers==0.13.2
123
- toml==0.10.2
124
- tomli==2.0.1
125
- toolz==0.12.0
126
- torch==1.12.1
127
- torch-complex==0.4.3
128
- torchaudio==0.12.1
129
- torchvision==0.13.1
130
- tqdm==4.64.1
131
- transformers==4.26.1
132
- typed-ast==1.5.4
133
- typeguard==2.13.3
134
- uc-micro-py==1.0.1
135
- Unidecode==1.3.6
136
- uvicorn==0.20.0
137
- websockets==10.4
138
- xmltodict==0.13.0
139
- yarl==1.8.2
140
- yq==3.1.0
141
- zipp==3.13.0
speaker_icons/female1.png CHANGED
speaker_icons/female2.png CHANGED
speaker_icons/female3.png CHANGED
speaker_icons/male-4.png DELETED
Binary file (355 kB)
 
speaker_icons/male1.png CHANGED
speaker_icons/male3.png CHANGED
speaker_icons/male4.png ADDED