Spaces:
Sleeping
Sleeping
Update ASR engine to whisper based
Browse files- .gitignore +2 -2
- app.py +20 -10
- app.ver1.py +72 -0
- app.whisper.fine_tuned.py +272 -0
- local/ASR_compare.py +90 -6
- local/ASR_conpare.py +72 -0
- local/PAL_dataset.py +34 -0
- local/app.genie.py +74 -0
- local/app.old.py +149 -0
- local/app.old.whipser.fined_tuned.py +146 -0
- local/app.vctk.py +146 -0
- local/app.whisper.py +281 -0
- local/semi_streaming_ASR_TTS.py +175 -0
- local/streaming_VAD.py +74 -0
- requirements.txt +6 -1
- requirements.txt.bak.bak +0 -141
- speaker_icons/female1.png +0 -0
- speaker_icons/female2.png +0 -0
- speaker_icons/female3.png +0 -0
- speaker_icons/male-4.png +0 -0
- speaker_icons/male1.png +0 -0
- speaker_icons/male3.png +0 -0
- speaker_icons/male4.png +0 -0
.gitignore
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
flagged
|
2 |
-
wav
|
3 |
samples
|
4 |
-
wav
|
|
|
5 |
wav.bak
|
6 |
|
7 |
model
|
|
|
1 |
flagged
|
|
|
2 |
samples
|
3 |
+
wav/*.wav
|
4 |
+
wav/**/*.wav
|
5 |
wav.bak
|
6 |
|
7 |
model
|
app.py
CHANGED
@@ -15,8 +15,7 @@ from pathlib import Path
|
|
15 |
# local import
|
16 |
import sys
|
17 |
from espnet2.bin.tts_inference import Text2Speech
|
18 |
-
|
19 |
-
# pdb.set_trace()
|
20 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
21 |
|
22 |
sys.path.append("src")
|
@@ -34,10 +33,22 @@ audio_files = [
|
|
34 |
)
|
35 |
]
|
36 |
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
|
37 |
-
transcriber = pipeline(
|
38 |
-
|
39 |
-
|
40 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
|
42 |
# 【Female】kan-bayashi ljspeech parallel wavegan
|
43 |
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
|
@@ -211,9 +222,6 @@ def download_file(audio_file):
|
|
211 |
return gr.File(value=audio_file)
|
212 |
# pdb.set_trace()
|
213 |
|
214 |
-
# if __name__ == "__main__":
|
215 |
-
# file_share_app.run(port=3000)
|
216 |
-
|
217 |
with gr.Blocks(
|
218 |
analytics_enabled=False,
|
219 |
css=".gradio-container {background-color: #78BD91}",
|
@@ -249,7 +257,7 @@ with gr.Blocks(
|
|
249 |
b2 = gr.Button("Convert")
|
250 |
|
251 |
output_audio = gr.Audio(
|
252 |
-
source="upload", label="Converted Audio", interactive=False
|
253 |
)
|
254 |
|
255 |
b2.click(
|
@@ -258,5 +266,7 @@ with gr.Blocks(
|
|
258 |
outputs=output_audio,
|
259 |
api_name="convert"
|
260 |
)
|
|
|
|
|
261 |
|
262 |
demo.launch(share=False)
|
|
|
15 |
# local import
|
16 |
import sys
|
17 |
from espnet2.bin.tts_inference import Text2Speech
|
18 |
+
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC# pdb.set_trace()
|
|
|
19 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
20 |
|
21 |
sys.path.append("src")
|
|
|
33 |
)
|
34 |
]
|
35 |
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
|
36 |
+
# transcriber = pipeline(
|
37 |
+
# "automatic-speech-recognition",
|
38 |
+
# model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
|
39 |
+
# )
|
40 |
+
|
41 |
+
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
|
42 |
+
|
43 |
+
processor = AutoProcessor.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
|
44 |
+
|
45 |
+
model = AutoModelForSpeechSeq2Seq.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
|
46 |
+
|
47 |
+
# feature_extractor = AutoFeatureExtractor.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
48 |
+
# representation_model = AutoModelForCTC.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
49 |
+
# tokenizer = AutoTokenizer.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
50 |
+
|
51 |
+
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
|
52 |
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
|
53 |
# 【Female】kan-bayashi ljspeech parallel wavegan
|
54 |
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
|
|
|
222 |
return gr.File(value=audio_file)
|
223 |
# pdb.set_trace()
|
224 |
|
|
|
|
|
|
|
225 |
with gr.Blocks(
|
226 |
analytics_enabled=False,
|
227 |
css=".gradio-container {background-color: #78BD91}",
|
|
|
257 |
b2 = gr.Button("Convert")
|
258 |
|
259 |
output_audio = gr.Audio(
|
260 |
+
source="upload", file="filepath", label="Converted Audio", interactive=False
|
261 |
)
|
262 |
|
263 |
b2.click(
|
|
|
266 |
outputs=output_audio,
|
267 |
api_name="convert"
|
268 |
)
|
269 |
+
|
270 |
+
# download_file("wav/001_F1_spkembs.wav")
|
271 |
|
272 |
demo.launch(share=False)
|
app.ver1.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#TODO:
|
2 |
+
# + [x] Load Configuration
|
3 |
+
# + [ ] Checking
|
4 |
+
# + [ ] Better saving directory
|
5 |
+
|
6 |
+
from pathlib import Path
|
7 |
+
from transformers import pipeline
|
8 |
+
import torch.nn as nn
|
9 |
+
import torch
|
10 |
+
import torchaudio
|
11 |
+
import gradio as gr
|
12 |
+
import sys
|
13 |
+
|
14 |
+
# Local imports
|
15 |
+
sys.path.append("src")
|
16 |
+
from espnet2.bin.tts_inference import Text2Speech
|
17 |
+
from espnet2.utils.types import str_or_none
|
18 |
+
|
19 |
+
# Check if GPU is available
|
20 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
21 |
+
|
22 |
+
# ASR part
|
23 |
+
|
24 |
+
data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
|
25 |
+
audio_files = sorted(list(Path(data_path).glob("**/*wav")))
|
26 |
+
# audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
|
27 |
+
|
28 |
+
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
29 |
+
|
30 |
+
# TTS part
|
31 |
+
def load_model(lang, tag, vocoder_tag):
|
32 |
+
if lang == "Japanese":
|
33 |
+
if tag == "kan-bayashi/ljspeech_parallel_wavegan":
|
34 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_parallel_wavegan")
|
35 |
+
elif tag == "kan-bayashi/ljspeech_merlin_multi_band_melgan":
|
36 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_merlin_multi_band_melgan")
|
37 |
+
else:
|
38 |
+
raise ValueError(f"Not supported: lang={lang}, tag={tag}")
|
39 |
+
vocoder = None if vocoder_tag == "none" else vocoder_tag
|
40 |
+
elif lang == "English":
|
41 |
+
# VITS needs no vocoder; others do
|
42 |
+
if tag == "kan-bayashi/libritts_xvector_vits":
|
43 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/libritts_xvector_vits")
|
44 |
+
vocoder = None
|
45 |
+
elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
|
46 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3")
|
47 |
+
vocoder = "melgan"
|
48 |
+
else:
|
49 |
+
raise ValueError(f"Not supported: lang={lang}, tag={tag}")
|
50 |
+
else:
|
51 |
+
raise ValueError(f"Not supported: lang={lang}")
|
52 |
+
return tts_model, vocoder
|
53 |
+
|
54 |
+
tts_model, vocoder_tag = load_model(lang="English", tag="kan-bayashi/libritts_xvector_vits", vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long")
|
55 |
+
tts_model = tts_model.to(device)
|
56 |
+
|
57 |
+
vocoder = None if vocoder_tag == "none" else torchaudio.models.vocoder.from_pretrained(vocoder_tag).to(device)
|
58 |
+
|
59 |
+
# Gradio part
|
60 |
+
def synthesize(text):
|
61 |
+
with torch.no_grad():
|
62 |
+
# Text-to-speech
|
63 |
+
wav = tts_model(text)[0]
|
64 |
+
if vocoder is not None:
|
65 |
+
# Apply vocoder
|
66 |
+
wav = vocoder.inference(wav)
|
67 |
+
# Convert to numpy array
|
68 |
+
wav = wav.squeeze().cpu().numpy()
|
69 |
+
return wav
|
70 |
+
|
71 |
+
interface = gr.Interface(synthesize, inputs="text", outputs="audio")
|
72 |
+
interface.launch()
|
app.whisper.fine_tuned.py
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
TODO:
|
3 |
+
+ [x] Load Configuration
|
4 |
+
+ [ ] Checking
|
5 |
+
+ [ ] Better saving directory
|
6 |
+
"""
|
7 |
+
import numpy as np
|
8 |
+
from pathlib import Path
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch
|
11 |
+
import torchaudio
|
12 |
+
from transformers import pipeline
|
13 |
+
from pathlib import Path
|
14 |
+
|
15 |
+
# local import
|
16 |
+
import sys
|
17 |
+
from espnet2.bin.tts_inference import Text2Speech
|
18 |
+
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC# pdb.set_trace()
|
19 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
20 |
+
|
21 |
+
sys.path.append("src")
|
22 |
+
|
23 |
+
import gradio as gr
|
24 |
+
|
25 |
+
# ASR part
|
26 |
+
|
27 |
+
audio_files = [
|
28 |
+
str(x)
|
29 |
+
for x in sorted(
|
30 |
+
Path(
|
31 |
+
"/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
|
32 |
+
).glob("**/*wav")
|
33 |
+
)
|
34 |
+
]
|
35 |
+
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
|
36 |
+
# transcriber = pipeline(
|
37 |
+
# "automatic-speech-recognition",
|
38 |
+
# model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
|
39 |
+
# )
|
40 |
+
|
41 |
+
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
|
42 |
+
|
43 |
+
processor = AutoProcessor.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
|
44 |
+
|
45 |
+
model = AutoModelForSpeechSeq2Seq.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
|
46 |
+
|
47 |
+
# feature_extractor = AutoFeatureExtractor.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
48 |
+
# representation_model = AutoModelForCTC.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
49 |
+
# tokenizer = AutoTokenizer.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
50 |
+
|
51 |
+
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
|
52 |
+
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
|
53 |
+
# 【Female】kan-bayashi ljspeech parallel wavegan
|
54 |
+
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
|
55 |
+
# 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
|
56 |
+
# pdb.set_trace()
|
57 |
+
|
58 |
+
# @title English multi-speaker pretrained model { run: "auto" }
|
59 |
+
lang = "English"
|
60 |
+
tag = "kan-bayashi/libritts_xvector_vits"
|
61 |
+
# vits needs no
|
62 |
+
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
|
63 |
+
from espnet2.bin.tts_inference import Text2Speech
|
64 |
+
from espnet2.utils.types import str_or_none
|
65 |
+
|
66 |
+
text2speech = Text2Speech.from_pretrained(
|
67 |
+
model_tag=str_or_none(tag),
|
68 |
+
vocoder_tag=str_or_none(vocoder_tag),
|
69 |
+
device="cuda",
|
70 |
+
use_att_constraint=False,
|
71 |
+
backward_window=1,
|
72 |
+
forward_window=3,
|
73 |
+
speed_control_alpha=1.0,
|
74 |
+
)
|
75 |
+
|
76 |
+
import glob
|
77 |
+
import os
|
78 |
+
import numpy as np
|
79 |
+
import kaldiio
|
80 |
+
|
81 |
+
# Get model directory path
|
82 |
+
from espnet_model_zoo.downloader import ModelDownloader
|
83 |
+
|
84 |
+
d = ModelDownloader()
|
85 |
+
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
|
86 |
+
|
87 |
+
# Speaker x-vector selection
|
88 |
+
|
89 |
+
xvector_ark = [
|
90 |
+
p
|
91 |
+
for p in glob.glob(
|
92 |
+
f"xvector/test-clean/spk_xvector.ark", recursive=True
|
93 |
+
)
|
94 |
+
if "test" in p
|
95 |
+
][0]
|
96 |
+
xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
|
97 |
+
spks = list(xvectors.keys())
|
98 |
+
|
99 |
+
male_spks = {
|
100 |
+
"Male1": "2300_131720",
|
101 |
+
"Male2": "1320_122612",
|
102 |
+
}
|
103 |
+
# "M3": "1188_133604",
|
104 |
+
# "M4": "61_70970",
|
105 |
+
female_spks = {"Female1": "2961_961", "Female2": "8463_287645", }
|
106 |
+
# "F3": "121_121726"
|
107 |
+
spks = dict(male_spks, **female_spks)
|
108 |
+
spk_names = sorted(spks.keys())
|
109 |
+
|
110 |
+
|
111 |
+
## 20230224 Mousa: No reference,
|
112 |
+
def ASRTTS(audio_file, spk_name, ref_text=""):
|
113 |
+
spk = spks[spk_name]
|
114 |
+
spembs = xvectors[spk]
|
115 |
+
if ref_text == "":
|
116 |
+
reg_text = transcriber(audio_file)["text"]
|
117 |
+
else:
|
118 |
+
reg_text = ref_text
|
119 |
+
|
120 |
+
speech, sr = torchaudio.load(
|
121 |
+
audio_file, channels_first=True
|
122 |
+
) # Mono channel
|
123 |
+
wav_tensor_spembs = text2speech(
|
124 |
+
text=reg_text, speech=speech, spembs=spembs
|
125 |
+
)["wav"]
|
126 |
+
wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
|
127 |
+
sample_rate = 22050
|
128 |
+
save_id = (
|
129 |
+
"./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
|
130 |
+
)
|
131 |
+
torchaudio.save(
|
132 |
+
save_id,
|
133 |
+
src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
|
134 |
+
sample_rate=22050,
|
135 |
+
)
|
136 |
+
|
137 |
+
return save_id, reg_text
|
138 |
+
|
139 |
+
|
140 |
+
def ASRTTS_clean(audio_file, spk_name):
|
141 |
+
spk = spks[spk_name]
|
142 |
+
spembs = xvectors[spk]
|
143 |
+
|
144 |
+
reg_text = transcriber(audio_file)["text"]
|
145 |
+
|
146 |
+
speech, sr = torchaudio.load(
|
147 |
+
audio_file, channels_first=True
|
148 |
+
) # Mono channel
|
149 |
+
wav_tensor_spembs = text2speech(
|
150 |
+
text=reg_text, speech=speech, spembs=spembs
|
151 |
+
)["wav"]
|
152 |
+
wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
|
153 |
+
sample_rate = 22050
|
154 |
+
save_id = (
|
155 |
+
"./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
|
156 |
+
)
|
157 |
+
torchaudio.save(
|
158 |
+
save_id,
|
159 |
+
src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
|
160 |
+
sample_rate=22050,
|
161 |
+
)
|
162 |
+
return save_id
|
163 |
+
|
164 |
+
|
165 |
+
reference_textbox = gr.Textbox(
|
166 |
+
value="",
|
167 |
+
placeholder="Input reference here",
|
168 |
+
label="Reference",
|
169 |
+
)
|
170 |
+
|
171 |
+
recognization_textbox = gr.Textbox(
|
172 |
+
value="",
|
173 |
+
placeholder="Output recognization here",
|
174 |
+
label="recognization_textbox",
|
175 |
+
)
|
176 |
+
|
177 |
+
speaker_option = gr.Radio(choices=spk_names, label="Speaker")
|
178 |
+
|
179 |
+
input_audio = gr.Audio(
|
180 |
+
source="upload", type="filepath", label="Audio_to_Evaluate"
|
181 |
+
)
|
182 |
+
output_audio = gr.Audio(
|
183 |
+
source="upload", file="filepath", label="Synthesized Audio"
|
184 |
+
)
|
185 |
+
examples = [
|
186 |
+
["./samples/001.wav", "M1", ""],
|
187 |
+
["./samples/002.wav", "M2", ""],
|
188 |
+
["./samples/003.wav", "F1", ""],
|
189 |
+
["./samples/004.wav", "F2", ""],
|
190 |
+
]
|
191 |
+
|
192 |
+
|
193 |
+
def change_audiobox(choice):
|
194 |
+
if choice == "upload":
|
195 |
+
input_audio = gr.Audio.update(source="upload", visible=True)
|
196 |
+
elif choice == "microphone":
|
197 |
+
input_audio = gr.Audio.update(source="microphone", visible=True)
|
198 |
+
else:
|
199 |
+
input_audio = gr.Audio.update(visible=False)
|
200 |
+
return input_audio
|
201 |
+
|
202 |
+
|
203 |
+
def show_icon(choice):
|
204 |
+
if choice == "Male1":
|
205 |
+
spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
|
206 |
+
elif choice == "Male2":
|
207 |
+
spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
|
208 |
+
elif choice == "Female1":
|
209 |
+
spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
|
210 |
+
elif choice == "Female2":
|
211 |
+
spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
|
212 |
+
return spk_icon
|
213 |
+
|
214 |
+
def get_download_file(audio_file=None):
|
215 |
+
if audio_file == None:
|
216 |
+
output_audio_file = gr.File.update(visible=False)
|
217 |
+
else:
|
218 |
+
output_audio_file = gr.File.update(visible=True)
|
219 |
+
return output_audio_file
|
220 |
+
|
221 |
+
def download_file(audio_file):
|
222 |
+
return gr.File(value=audio_file)
|
223 |
+
# pdb.set_trace()
|
224 |
+
|
225 |
+
with gr.Blocks(
|
226 |
+
analytics_enabled=False,
|
227 |
+
css=".gradio-container {background-color: #78BD91}",
|
228 |
+
) as demo:
|
229 |
+
with gr.Column(elem_id="Column"):
|
230 |
+
input_format = gr.Radio(
|
231 |
+
choices=["microphone", "upload"], label="Choose your input format", elem_id="input_format"
|
232 |
+
)
|
233 |
+
input_audio = gr.Audio(
|
234 |
+
source="microphone",
|
235 |
+
type="filepath",
|
236 |
+
label="Input Audio",
|
237 |
+
interactive=True,
|
238 |
+
visible=False,
|
239 |
+
elem_id="input_audio"
|
240 |
+
)
|
241 |
+
input_format.change(
|
242 |
+
fn=change_audiobox, inputs=input_format, outputs=input_audio
|
243 |
+
)
|
244 |
+
|
245 |
+
speaker_option = gr.Radio(choices=spk_names, value="Male1", label="Choose your voice profile")
|
246 |
+
spk_icon = gr.Image(value="speaker_icons/male1.png",
|
247 |
+
type="filepath",
|
248 |
+
image_mode="RGB",
|
249 |
+
source="upload",
|
250 |
+
shape=[50, 50],
|
251 |
+
interactive=True,
|
252 |
+
visible=True)
|
253 |
+
speaker_option.change(
|
254 |
+
fn=show_icon, inputs=speaker_option, outputs=spk_icon
|
255 |
+
)
|
256 |
+
|
257 |
+
b2 = gr.Button("Convert")
|
258 |
+
|
259 |
+
output_audio = gr.Audio(
|
260 |
+
source="upload", file="filepath", label="Converted Audio", interactive=False
|
261 |
+
)
|
262 |
+
|
263 |
+
b2.click(
|
264 |
+
ASRTTS_clean,
|
265 |
+
inputs=[input_audio, speaker_option],
|
266 |
+
outputs=output_audio,
|
267 |
+
api_name="convert"
|
268 |
+
)
|
269 |
+
|
270 |
+
# download_file("wav/001_F1_spkembs.wav")
|
271 |
+
|
272 |
+
demo.launch(share=False)
|
local/ASR_compare.py
CHANGED
@@ -44,6 +44,26 @@ transcriber = pipeline(
|
|
44 |
old_transcriber = pipeline(
|
45 |
"automatic-speech-recognition", "facebook/wav2vec2-base-960h"
|
46 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
|
48 |
# 【Female】kan-bayashi ljspeech parallel wavegan
|
49 |
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
|
@@ -81,7 +101,7 @@ from espnet_model_zoo.downloader import ModelDownloader
|
|
81 |
|
82 |
d = ModelDownloader()
|
83 |
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
|
84 |
-
|
85 |
# Speaker x-vector selection
|
86 |
|
87 |
xvector_ark = [
|
@@ -92,6 +112,7 @@ xvector_ark = [
|
|
92 |
if "tr" in p
|
93 |
][0]
|
94 |
xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
|
|
|
95 |
spks = list(xvectors.keys())
|
96 |
|
97 |
male_spks = {
|
@@ -115,6 +136,25 @@ def ASRnew(audio_file):
|
|
115 |
reg_text = transcriber(audio_file)["text"]
|
116 |
return reg_text
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
# def ref_reg_callback(audio_file, spk_name, ref_text):
|
120 |
# reg_text = ref_text
|
@@ -190,25 +230,69 @@ with gr.Blocks(
|
|
190 |
|
191 |
with gr.Row():
|
192 |
b1 = gr.Button("Conventional Speech Recognition Engine")
|
193 |
-
|
194 |
value="",
|
195 |
placeholder="Recognition output",
|
196 |
label="Convertional",
|
197 |
)
|
198 |
b1.click(
|
199 |
-
ASRold, inputs=[input_audio], outputs=
|
200 |
)
|
201 |
|
202 |
with gr.Row():
|
203 |
-
b2 = gr.Button("Laronix Speech Recognition Engine")
|
204 |
-
|
205 |
value="",
|
206 |
placeholder="Recognition output",
|
207 |
label="Purposed",
|
208 |
)
|
209 |
|
210 |
b2.click(
|
211 |
-
ASRnew, inputs=[input_audio], outputs=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
)
|
213 |
|
214 |
demo.launch(share=True)
|
|
|
44 |
old_transcriber = pipeline(
|
45 |
"automatic-speech-recognition", "facebook/wav2vec2-base-960h"
|
46 |
)
|
47 |
+
whisper_transcriber = pipeline(
|
48 |
+
"automatic-speech-recognition", "KevinGeng/whipser_medium_en_PAL300_step25"
|
49 |
+
)
|
50 |
+
|
51 |
+
whisper_transcriber_org = pipeline(
|
52 |
+
"automatic-speech-recognition", "KevinGeng/whisper-medium-PAL128-25step"
|
53 |
+
)
|
54 |
+
|
55 |
+
whisper_transcriber_Tony = pipeline(
|
56 |
+
"automatic-speech-recognition", "KevinGeng/Tony1_AVA_script_conv_train_conv_dev"
|
57 |
+
)
|
58 |
+
|
59 |
+
whisper_transcriber_John = pipeline(
|
60 |
+
"automatic-speech-recognition", "KevinGeng/whipser_medium_en_PAL300_step25_step2_VTCK"
|
61 |
+
)
|
62 |
+
|
63 |
+
whisper_transcriber_Negel = pipeline(
|
64 |
+
"automatic-speech-recognition", "KevinGeng/Negel_152_AVA_script_conv_train_conv_dev"
|
65 |
+
)
|
66 |
+
|
67 |
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
|
68 |
# 【Female】kan-bayashi ljspeech parallel wavegan
|
69 |
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
|
|
|
101 |
|
102 |
d = ModelDownloader()
|
103 |
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
|
104 |
+
pdb.set_trace()
|
105 |
# Speaker x-vector selection
|
106 |
|
107 |
xvector_ark = [
|
|
|
112 |
if "tr" in p
|
113 |
][0]
|
114 |
xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
|
115 |
+
|
116 |
spks = list(xvectors.keys())
|
117 |
|
118 |
male_spks = {
|
|
|
136 |
reg_text = transcriber(audio_file)["text"]
|
137 |
return reg_text
|
138 |
|
139 |
+
def ASRwhipser_FT(audio_file):
|
140 |
+
reg_text = whisper_transcriber(audio_file)["text"]
|
141 |
+
return reg_text
|
142 |
+
|
143 |
+
def ASRwhipser_Org(audio_file):
|
144 |
+
reg_text = whisper_transcriber_org(audio_file)["text"]
|
145 |
+
return reg_text
|
146 |
+
|
147 |
+
def ASRwhipser_Tony(audio_file):
|
148 |
+
reg_text = whisper_transcriber_Tony(audio_file)["text"]
|
149 |
+
return reg_text
|
150 |
+
|
151 |
+
def ASRwhipser_Negel(audio_file):
|
152 |
+
reg_text = whisper_transcriber_Negel(audio_file)["text"]
|
153 |
+
return reg_text
|
154 |
+
|
155 |
+
def ASRwhipser_John(audio_file):
|
156 |
+
reg_text = whisper_transcriber_John(audio_file)["text"]
|
157 |
+
return reg_text
|
158 |
|
159 |
# def ref_reg_callback(audio_file, spk_name, ref_text):
|
160 |
# reg_text = ref_text
|
|
|
230 |
|
231 |
with gr.Row():
|
232 |
b1 = gr.Button("Conventional Speech Recognition Engine")
|
233 |
+
t1 = gr.Textbox(
|
234 |
value="",
|
235 |
placeholder="Recognition output",
|
236 |
label="Convertional",
|
237 |
)
|
238 |
b1.click(
|
239 |
+
ASRold, inputs=[input_audio], outputs=t1
|
240 |
)
|
241 |
|
242 |
with gr.Row():
|
243 |
+
b2 = gr.Button("Laronix Speech Recognition Engine (Ver1, wav2vec2.0+CTC)")
|
244 |
+
t2 = gr.Textbox(
|
245 |
value="",
|
246 |
placeholder="Recognition output",
|
247 |
label="Purposed",
|
248 |
)
|
249 |
|
250 |
b2.click(
|
251 |
+
ASRnew, inputs=[input_audio], outputs=t2
|
252 |
+
)
|
253 |
+
with gr.Row():
|
254 |
+
b3 = gr.Button("Laronix Speech Recognition Engine (Ver2, Whipser)")
|
255 |
+
t3 = gr.Textbox(
|
256 |
+
value="",
|
257 |
+
placeholder="Recognition output",
|
258 |
+
label="Purposed",
|
259 |
+
)
|
260 |
+
|
261 |
+
b3.click(
|
262 |
+
ASRwhipser_FT, inputs=[input_audio], outputs=t3
|
263 |
+
)
|
264 |
+
with gr.Row():
|
265 |
+
b4 = gr.Button("Laronix Speech Recognition Engine (Whipser, FT with Tony)")
|
266 |
+
t4 = gr.Textbox(
|
267 |
+
value="",
|
268 |
+
placeholder="Recognition output",
|
269 |
+
label="Purposed",
|
270 |
+
)
|
271 |
+
|
272 |
+
b4.click(
|
273 |
+
ASRwhipser_Tony, inputs=[input_audio], outputs=t4
|
274 |
+
)
|
275 |
+
with gr.Row():
|
276 |
+
b5 = gr.Button("Laronix Speech Recognition Engine (Whipser, FT with John)")
|
277 |
+
t5 = gr.Textbox(
|
278 |
+
value="",
|
279 |
+
placeholder="Recognition output",
|
280 |
+
label="Purposed",
|
281 |
+
)
|
282 |
+
|
283 |
+
b5.click(
|
284 |
+
ASRwhipser_John, inputs=[input_audio], outputs=t5
|
285 |
+
)
|
286 |
+
with gr.Row():
|
287 |
+
b6 = gr.Button("Laronix Speech Recognition Engine (Whipser, FT with Negel)")
|
288 |
+
t6 = gr.Textbox(
|
289 |
+
value="",
|
290 |
+
placeholder="Recognition output",
|
291 |
+
label="Purposed",
|
292 |
+
)
|
293 |
+
|
294 |
+
b6.click(
|
295 |
+
ASRwhipser_Negel, inputs=[input_audio], outputs=t6
|
296 |
)
|
297 |
|
298 |
demo.launch(share=True)
|
local/ASR_conpare.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#TODO:
|
2 |
+
# + [x] Load Configuration
|
3 |
+
# + [ ] Checking
|
4 |
+
# + [ ] Better saving directory
|
5 |
+
|
6 |
+
from pathlib import Path
|
7 |
+
from transformers import pipeline
|
8 |
+
import torch.nn as nn
|
9 |
+
import torch
|
10 |
+
import torchaudio
|
11 |
+
import gradio as gr
|
12 |
+
import sys
|
13 |
+
|
14 |
+
# Local imports
|
15 |
+
sys.path.append("src")
|
16 |
+
from espnet2.bin.tts_inference import Text2Speech
|
17 |
+
from espnet2.utils.types import str_or_none
|
18 |
+
|
19 |
+
# Check if GPU is available
|
20 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
21 |
+
|
22 |
+
# ASR part
|
23 |
+
|
24 |
+
data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
|
25 |
+
audio_files = sorted(list(Path(data_path).glob("**/*wav")))
|
26 |
+
# audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
|
27 |
+
|
28 |
+
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
29 |
+
|
30 |
+
# TTS part
|
31 |
+
def load_model(lang, tag, vocoder_tag):
|
32 |
+
if lang == "Japanese":
|
33 |
+
if tag == "kan-bayashi/ljspeech_parallel_wavegan":
|
34 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_parallel_wavegan")
|
35 |
+
elif tag == "kan-bayashi/ljspeech_merlin_multi_band_melgan":
|
36 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_merlin_multi_band_melgan")
|
37 |
+
else:
|
38 |
+
raise ValueError(f"Not supported: lang={lang}, tag={tag}")
|
39 |
+
vocoder = None if vocoder_tag == "none" else vocoder_tag
|
40 |
+
elif lang == "English":
|
41 |
+
# VITS needs no vocoder; others do
|
42 |
+
if tag == "kan-bayashi/libritts_xvector_vits":
|
43 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/libritts_xvector_vits")
|
44 |
+
vocoder = None
|
45 |
+
elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
|
46 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3")
|
47 |
+
vocoder = "melgan"
|
48 |
+
else:
|
49 |
+
raise ValueError(f"Not supported: lang={lang}, tag={tag}")
|
50 |
+
else:
|
51 |
+
raise ValueError(f"Not supported: lang={lang}")
|
52 |
+
return tts_model, vocoder
|
53 |
+
|
54 |
+
tts_model, vocoder_tag = load_model(lang="English", tag="kan-bayashi/libritts_xvector_vits", vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long")
|
55 |
+
tts_model = tts_model.to(device)
|
56 |
+
|
57 |
+
vocoder = None if vocoder_tag == "none" else torchaudio.models.vocoder.from_pretrained(vocoder_tag).to(device)
|
58 |
+
|
59 |
+
# Gradio part
|
60 |
+
def synthesize(text):
|
61 |
+
with torch.no_grad():
|
62 |
+
# Text-to-speech
|
63 |
+
wav = tts_model(text)[0]
|
64 |
+
if vocoder is not None:
|
65 |
+
# Apply vocoder
|
66 |
+
wav = vocoder.inference(wav)
|
67 |
+
# Convert to numpy array
|
68 |
+
wav = wav.squeeze().cpu().numpy()
|
69 |
+
return wav
|
70 |
+
|
71 |
+
interface = gr.Interface(synthesize, inputs="text", outputs="audio")
|
72 |
+
interface.launch()
|
local/PAL_dataset.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## ADD dataset appendning
|
2 |
+
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
|
3 |
+
import pdb
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
# to_dataset = load_dataset("KevinGeng/testdataset")
|
7 |
+
base_dataset = load_dataset("../laronix_automos/data/Patient_sil_trim_16k_normed_5_snr_40")
|
8 |
+
base_extra_dataset = load_dataset("../laronix_automos/data/John_p326_large")
|
9 |
+
|
10 |
+
PAL_dataset = DatasetDict({"base": base_dataset['train'], "base_extra": base_extra_dataset['train']})
|
11 |
+
# PAL_dataset.push_to_hub("KevinGeng/PAL_dataset")
|
12 |
+
concatenate_datasets(base_dataset['train'], base_extra_dataset['train'])
|
13 |
+
pdb.set_trace()
|
14 |
+
|
15 |
+
new_record = {"audio":
|
16 |
+
{'path': 'Arthur_set1_001_noisy.wav',
|
17 |
+
'array': np.array([0.02526855, 0.04602051, 0.04873657, 0.00045776, 0.00201416, 0.00167847]),
|
18 |
+
'sampling_rate': 16000},
|
19 |
+
"transcription": "TOD"}
|
20 |
+
pdb.set_trace()
|
21 |
+
|
22 |
+
import requests
|
23 |
+
headers = {"Authorization": f"KevinGeng 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"}
|
24 |
+
# headers = {"Authorization": "Haopeng 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"}
|
25 |
+
# pdb.set_trace()
|
26 |
+
API_URL = "https://datasets-server.huggingface.co/is-valid?dataset=KevinGeng/testdataset"
|
27 |
+
|
28 |
+
def query():
|
29 |
+
response = requests.request("GET", API_URL, headers=headers)
|
30 |
+
# pdb.set_trace()
|
31 |
+
return response.json()
|
32 |
+
data = query()
|
33 |
+
|
34 |
+
pdb.set_trace()
|
local/app.genie.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#TODO:
|
2 |
+
# + [x] Load Configuration
|
3 |
+
# + [ ] Checking
|
4 |
+
# + [ ] Better saving directory
|
5 |
+
|
6 |
+
from pathlib import Path
|
7 |
+
from transformers import pipeline
|
8 |
+
import torch.nn as nn
|
9 |
+
import torch
|
10 |
+
import torchaudio
|
11 |
+
import gradio as gr
|
12 |
+
import sys
|
13 |
+
|
14 |
+
# Local imports
|
15 |
+
sys.path.append("src")
|
16 |
+
from espnet2.bin.tts_inference import Text2Speech
|
17 |
+
from espnet2.utils.types import str_or_none
|
18 |
+
|
19 |
+
# Check if GPU is available
|
20 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
21 |
+
|
22 |
+
# ASR part
|
23 |
+
|
24 |
+
data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
|
25 |
+
audio_files = sorted(list(Path(data_path).glob("**/*wav")))
|
26 |
+
# audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
|
27 |
+
|
28 |
+
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
29 |
+
|
30 |
+
# TTS part
|
31 |
+
def load_model(lang, tag, vocoder_tag):
|
32 |
+
if lang == "Japanese":
|
33 |
+
if tag == "kan-bayashi/ljspeech_parallel_wavegan":
|
34 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_parallel_wavegan")
|
35 |
+
elif tag == "kan-bayashi/ljspeech_merlin_multi_band_melgan":
|
36 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_merlin_multi_band_melgan")
|
37 |
+
else:
|
38 |
+
raise ValueError(f"Not supported: lang={lang}, tag={tag}")
|
39 |
+
vocoder = None if vocoder_tag == "none" else vocoder_tag
|
40 |
+
elif lang == "English":
|
41 |
+
# VITS needs no vocoder; others do
|
42 |
+
if tag == "kan-bayashi/libritts_xvector_vits":
|
43 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/libritts_xvector_vits")
|
44 |
+
vocoder = None
|
45 |
+
elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
|
46 |
+
tts_model = Text2Speech.from_pretrained("kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3")
|
47 |
+
vocoder = "melgan"
|
48 |
+
else:
|
49 |
+
raise ValueError(f"Not supported: lang={lang}, tag={tag}")
|
50 |
+
else:
|
51 |
+
raise ValueError(f"Not supported: lang={lang}")
|
52 |
+
return tts_model, vocoder
|
53 |
+
|
54 |
+
tts_model, vocoder_tag = load_model(lang="English", tag="kan-bayashi/libritts_xvector_vits", vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long")
|
55 |
+
import pdb
|
56 |
+
pdb.set_trace()
|
57 |
+
tts_model = tts_model.to(device)
|
58 |
+
|
59 |
+
vocoder = None if vocoder_tag == "none" else torchaudio.models.vocoder.from_pretrained(vocoder_tag).to(device)
|
60 |
+
|
61 |
+
# Gradio part
|
62 |
+
def synthesize(text):
|
63 |
+
with torch.no_grad():
|
64 |
+
# Text-to-speech
|
65 |
+
wav = tts_model(text)[0]
|
66 |
+
if vocoder is not None:
|
67 |
+
# Apply vocoder
|
68 |
+
wav = vocoder.inference(wav)
|
69 |
+
# Convert to numpy array
|
70 |
+
wav = wav.squeeze().cpu().numpy()
|
71 |
+
return wav
|
72 |
+
|
73 |
+
interface = gr.Interface(synthesize, inputs="text", outputs="audio")
|
74 |
+
interface.launch()
|
local/app.old.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
TODO:
|
3 |
+
+ [x] Load Configuration
|
4 |
+
+ [ ] Checking
|
5 |
+
+ [ ] Better saving directory
|
6 |
+
"""
|
7 |
+
import numpy as np
|
8 |
+
from pathlib import Path
|
9 |
+
import jiwer
|
10 |
+
import pdb
|
11 |
+
import torch.nn as nn
|
12 |
+
import torch
|
13 |
+
import torchaudio
|
14 |
+
from transformers import pipeline
|
15 |
+
from time import process_time, time
|
16 |
+
from pathlib import Path
|
17 |
+
# local import
|
18 |
+
import sys
|
19 |
+
from espnet2.bin.tts_inference import Text2Speech
|
20 |
+
# pdb.set_trace()
|
21 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
22 |
+
|
23 |
+
sys.path.append("src")
|
24 |
+
|
25 |
+
import gradio as gr
|
26 |
+
|
27 |
+
# ASR part
|
28 |
+
|
29 |
+
audio_files = [str(x) for x in sorted(Path("/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video").glob("**/*wav"))]
|
30 |
+
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
|
31 |
+
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
32 |
+
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
|
33 |
+
# 【Female】kan-bayashi ljspeech parallel wavegan
|
34 |
+
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
|
35 |
+
# 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
|
36 |
+
# pdb.set_trace()
|
37 |
+
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
|
38 |
+
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
|
39 |
+
|
40 |
+
#@title English multi-speaker pretrained model { run: "auto" }
|
41 |
+
lang = 'English'
|
42 |
+
tag = 'kan-bayashi/libritts_xvector_vits'
|
43 |
+
# tag = "kan-bayashi/vctk_multi_spk_vits"
|
44 |
+
# vits needs no
|
45 |
+
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" #@param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
|
46 |
+
from espnet2.bin.tts_inference import Text2Speech
|
47 |
+
from espnet2.utils.types import str_or_none
|
48 |
+
|
49 |
+
text2speech = Text2Speech.from_pretrained(
|
50 |
+
model_tag=str_or_none(tag),
|
51 |
+
vocoder_tag=str_or_none(vocoder_tag),
|
52 |
+
device="cuda",
|
53 |
+
use_att_constraint=False,
|
54 |
+
backward_window=1,
|
55 |
+
forward_window=3,
|
56 |
+
speed_control_alpha=1.0,
|
57 |
+
)
|
58 |
+
|
59 |
+
import glob
|
60 |
+
import os
|
61 |
+
import numpy as np
|
62 |
+
import kaldiio
|
63 |
+
|
64 |
+
# Get model directory path
|
65 |
+
from espnet_model_zoo.downloader import ModelDownloader
|
66 |
+
d = ModelDownloader()
|
67 |
+
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
|
68 |
+
|
69 |
+
# Speaker x-vector selection
|
70 |
+
|
71 |
+
xvector_ark = [p for p in glob.glob(f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True) if "tr" in p][0]
|
72 |
+
xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
|
73 |
+
import pdb
|
74 |
+
|
75 |
+
pdb.set_trace()
|
76 |
+
|
77 |
+
spks = list(xvectors.keys())
|
78 |
+
|
79 |
+
male_spks = {"M1": "2300_131720", "M2": "1320_122612", "M3": "1188_133604", "M4": "61_70970"}
|
80 |
+
female_spks = {"F1": "2961_961", "F2": "8463_287645", "F3": "121_121726"}
|
81 |
+
spks = dict(male_spks, **female_spks)
|
82 |
+
spk_names = sorted(spks.keys())
|
83 |
+
|
84 |
+
def ASRTTS(audio_file, spk_name, ref_text=""):
|
85 |
+
spk = spks[spk_name]
|
86 |
+
spembs = xvectors[spk]
|
87 |
+
if ref_text == "":
|
88 |
+
reg_text = transcriber(audio_file)['text']
|
89 |
+
else:
|
90 |
+
reg_text = ref_text
|
91 |
+
|
92 |
+
speech, sr = torchaudio.load(audio_file, channels_first=True) # Mono channel
|
93 |
+
wav_tensor_spembs = text2speech(text=reg_text, speech=speech, spembs=spembs)["wav"]
|
94 |
+
wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
|
95 |
+
sample_rate = 22050
|
96 |
+
save_id = "./wav/" + Path(audio_file).stem + "_" + spk_name +"_spkembs.wav"
|
97 |
+
torchaudio.save(save_id, src=wav_tensor_spembs.unsqueeze(0).to("cpu"), sample_rate=22050)
|
98 |
+
|
99 |
+
return save_id, reg_text
|
100 |
+
|
101 |
+
def ref_reg_callback(audio_file, spk_name, ref_text):
|
102 |
+
reg_text = ref_text
|
103 |
+
return audio_file, spk_name, reg_text
|
104 |
+
|
105 |
+
reference_textbox = gr.Textbox(
|
106 |
+
value="",
|
107 |
+
placeholder="Input reference here",
|
108 |
+
label="Reference",
|
109 |
+
)
|
110 |
+
|
111 |
+
recognization_textbox = gr.Textbox(
|
112 |
+
value="",
|
113 |
+
placeholder="Output recognization here",
|
114 |
+
label="recognization_textbox",
|
115 |
+
)
|
116 |
+
|
117 |
+
speaker_option = gr.Radio(choices=spk_names, label="Speaker")
|
118 |
+
|
119 |
+
input_audio = gr.Audio(
|
120 |
+
source="microphone",
|
121 |
+
type="filepath",
|
122 |
+
label="Audio_to_Evaluate"
|
123 |
+
)
|
124 |
+
output_audio = gr.Audio(
|
125 |
+
source="upload",
|
126 |
+
file="filepath",
|
127 |
+
label="Synthesized Audio"
|
128 |
+
)
|
129 |
+
examples = [["./samples/001.wav",'M1', ""],
|
130 |
+
["./samples/002.wav",'M2', ""],
|
131 |
+
["./samples/003.wav",'F1', ""],
|
132 |
+
["./samples/004.wav",'F2', ""]]
|
133 |
+
|
134 |
+
# ASRTTS(*examples[0])
|
135 |
+
iface = gr.Interface(
|
136 |
+
fn = ASRTTS,
|
137 |
+
inputs = [
|
138 |
+
input_audio,
|
139 |
+
speaker_option,
|
140 |
+
reference_textbox,
|
141 |
+
],
|
142 |
+
outputs = [
|
143 |
+
output_audio,
|
144 |
+
recognization_textbox
|
145 |
+
],
|
146 |
+
examples = examples
|
147 |
+
)
|
148 |
+
iface.input_callback = ref_reg_callback
|
149 |
+
iface.launch(share=False)
|
local/app.old.whipser.fined_tuned.py
ADDED
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
TODO:
|
3 |
+
+ [x] Load Configuration
|
4 |
+
+ [ ] Checking
|
5 |
+
+ [ ] Better saving directory
|
6 |
+
"""
|
7 |
+
import numpy as np
|
8 |
+
from pathlib import Path
|
9 |
+
import jiwer
|
10 |
+
import pdb
|
11 |
+
import torch.nn as nn
|
12 |
+
import torch
|
13 |
+
import torchaudio
|
14 |
+
from transformers import pipeline
|
15 |
+
from time import process_time, time
|
16 |
+
from pathlib import Path
|
17 |
+
# local import
|
18 |
+
import sys
|
19 |
+
from espnet2.bin.tts_inference import Text2Speech
|
20 |
+
# pdb.set_trace()
|
21 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
22 |
+
|
23 |
+
sys.path.append("src")
|
24 |
+
|
25 |
+
import gradio as gr
|
26 |
+
|
27 |
+
# ASR part
|
28 |
+
|
29 |
+
audio_files = [str(x) for x in sorted(Path("/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video").glob("**/*wav"))]
|
30 |
+
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
|
31 |
+
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
32 |
+
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
|
33 |
+
# 【Female】kan-bayashi ljspeech parallel wavegan
|
34 |
+
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
|
35 |
+
# 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
|
36 |
+
# pdb.set_trace()
|
37 |
+
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
|
38 |
+
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
|
39 |
+
|
40 |
+
#@title English multi-speaker pretrained model { run: "auto" }
|
41 |
+
lang = 'English'
|
42 |
+
tag = 'kan-bayashi/libritts_xvector_vits'
|
43 |
+
# tag = "kan-bayashi/vctk_multi_spk_vits"
|
44 |
+
# vits needs no
|
45 |
+
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" #@param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
|
46 |
+
from espnet2.bin.tts_inference import Text2Speech
|
47 |
+
from espnet2.utils.types import str_or_none
|
48 |
+
|
49 |
+
text2speech = Text2Speech.from_pretrained(
|
50 |
+
model_tag=str_or_none(tag),
|
51 |
+
vocoder_tag=str_or_none(vocoder_tag),
|
52 |
+
device="cuda",
|
53 |
+
use_att_constraint=False,
|
54 |
+
backward_window=1,
|
55 |
+
forward_window=3,
|
56 |
+
speed_control_alpha=1.0,
|
57 |
+
)
|
58 |
+
|
59 |
+
|
60 |
+
import glob
|
61 |
+
import os
|
62 |
+
import numpy as np
|
63 |
+
import kaldiio
|
64 |
+
|
65 |
+
# Get model directory path
|
66 |
+
from espnet_model_zoo.downloader import ModelDownloader
|
67 |
+
d = ModelDownloader()
|
68 |
+
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
|
69 |
+
|
70 |
+
# Speaker x-vector selection
|
71 |
+
|
72 |
+
xvector_ark = [p for p in glob.glob(f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True) if "tr" in p][0]
|
73 |
+
xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
|
74 |
+
spks = list(xvectors.keys())
|
75 |
+
|
76 |
+
male_spks = {"M1": "2300_131720", "M2": "1320_122612", "M3": "1188_133604", "M4": "61_70970"}
|
77 |
+
female_spks = {"F1": "2961_961", "F2": "8463_287645", "F3": "121_121726"}
|
78 |
+
spks = dict(male_spks, **female_spks)
|
79 |
+
spk_names = sorted(spks.keys())
|
80 |
+
|
81 |
+
def ASRTTS(audio_file, spk_name, ref_text=""):
|
82 |
+
spk = spks[spk_name]
|
83 |
+
spembs = xvectors[spk]
|
84 |
+
if ref_text == "":
|
85 |
+
reg_text = transcriber(audio_file)['text']
|
86 |
+
else:
|
87 |
+
reg_text = ref_text
|
88 |
+
|
89 |
+
speech, sr = torchaudio.load(audio_file, channels_first=True) # Mono channel
|
90 |
+
wav_tensor_spembs = text2speech(text=reg_text, speech=speech, spembs=spembs)["wav"]
|
91 |
+
wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
|
92 |
+
sample_rate = 22050
|
93 |
+
save_id = "./wav/" + Path(audio_file).stem + "_" + spk_name +"_spkembs.wav"
|
94 |
+
torchaudio.save(save_id, src=wav_tensor_spembs.unsqueeze(0).to("cpu"), sample_rate=22050)
|
95 |
+
|
96 |
+
return save_id, reg_text
|
97 |
+
|
98 |
+
def ref_reg_callback(audio_file, spk_name, ref_text):
|
99 |
+
reg_text = ref_text
|
100 |
+
return audio_file, spk_name, reg_text
|
101 |
+
|
102 |
+
reference_textbox = gr.Textbox(
|
103 |
+
value="",
|
104 |
+
placeholder="Input reference here",
|
105 |
+
label="Reference",
|
106 |
+
)
|
107 |
+
|
108 |
+
recognization_textbox = gr.Textbox(
|
109 |
+
value="",
|
110 |
+
placeholder="Output recognization here",
|
111 |
+
label="recognization_textbox",
|
112 |
+
)
|
113 |
+
|
114 |
+
speaker_option = gr.Radio(choices=spk_names, label="Speaker")
|
115 |
+
|
116 |
+
input_audio = gr.Audio(
|
117 |
+
source="microphone",
|
118 |
+
type="filepath",
|
119 |
+
label="Audio_to_Evaluate"
|
120 |
+
)
|
121 |
+
output_audio = gr.Audio(
|
122 |
+
source="upload",
|
123 |
+
file="filepath",
|
124 |
+
label="Synthesized Audio"
|
125 |
+
)
|
126 |
+
examples = [["./samples/001.wav",'M1', ""],
|
127 |
+
["./samples/002.wav",'M2', ""],
|
128 |
+
["./samples/003.wav",'F1', ""],
|
129 |
+
["./samples/004.wav",'F2', ""]]
|
130 |
+
|
131 |
+
# ASRTTS(*examples[0])
|
132 |
+
iface = gr.Interface(
|
133 |
+
fn = ASRTTS,
|
134 |
+
inputs = [
|
135 |
+
input_audio,
|
136 |
+
speaker_option,
|
137 |
+
reference_textbox,
|
138 |
+
],
|
139 |
+
outputs = [
|
140 |
+
output_audio,
|
141 |
+
recognization_textbox
|
142 |
+
],
|
143 |
+
examples = examples
|
144 |
+
)
|
145 |
+
iface.input_callback = ref_reg_callback
|
146 |
+
iface.launch(share=False)
|
local/app.vctk.py
ADDED
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
TODO:
|
3 |
+
+ [x] Load Configuration
|
4 |
+
+ [ ] Checking
|
5 |
+
+ [ ] Better saving directory
|
6 |
+
"""
|
7 |
+
import numpy as np
|
8 |
+
from pathlib import Path
|
9 |
+
import jiwer
|
10 |
+
import pdb
|
11 |
+
import torch.nn as nn
|
12 |
+
import torch
|
13 |
+
import torchaudio
|
14 |
+
from transformers import pipeline
|
15 |
+
from time import process_time, time
|
16 |
+
from pathlib import Path
|
17 |
+
# local import
|
18 |
+
import sys
|
19 |
+
from espnet2.bin.tts_inference import Text2Speech
|
20 |
+
# pdb.set_trace()
|
21 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
22 |
+
|
23 |
+
sys.path.append("src")
|
24 |
+
|
25 |
+
import gradio as gr
|
26 |
+
|
27 |
+
# ASR part
|
28 |
+
|
29 |
+
audio_files = [str(x) for x in sorted(Path("/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video").glob("**/*wav"))]
|
30 |
+
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
|
31 |
+
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
|
32 |
+
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
|
33 |
+
# 【Female】kan-bayashi ljspeech parallel wavegan
|
34 |
+
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
|
35 |
+
# 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
|
36 |
+
# pdb.set_trace()
|
37 |
+
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
|
38 |
+
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
|
39 |
+
|
40 |
+
#@title English multi-speaker pretrained model { run: "auto" }
|
41 |
+
lang = 'English'
|
42 |
+
tag = 'kan-bayashi/libritts_xvector_vits'
|
43 |
+
# tag = "kan-bayashi/vctk_multi_spk_vits"
|
44 |
+
# vits needs no
|
45 |
+
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" #@param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
|
46 |
+
from espnet2.bin.tts_inference import Text2Speech
|
47 |
+
from espnet2.utils.types import str_or_none
|
48 |
+
|
49 |
+
text2speech = Text2Speech.from_pretrained(
|
50 |
+
model_tag=str_or_none(tag),
|
51 |
+
vocoder_tag=str_or_none(vocoder_tag),
|
52 |
+
device="cuda",
|
53 |
+
use_att_constraint=False,
|
54 |
+
backward_window=1,
|
55 |
+
forward_window=3,
|
56 |
+
speed_control_alpha=1.0,
|
57 |
+
)
|
58 |
+
|
59 |
+
|
60 |
+
import glob
|
61 |
+
import os
|
62 |
+
import numpy as np
|
63 |
+
import kaldiio
|
64 |
+
|
65 |
+
# Get model directory path
|
66 |
+
from espnet_model_zoo.downloader import ModelDownloader
|
67 |
+
d = ModelDownloader()
|
68 |
+
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
|
69 |
+
|
70 |
+
# Speaker x-vector selection
|
71 |
+
|
72 |
+
xvector_ark = [p for p in glob.glob(f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True) if "tr" in p][0]
|
73 |
+
xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
|
74 |
+
spks = list(xvectors.keys())
|
75 |
+
|
76 |
+
male_spks = {"M1": "2300_131720", "M2": "1320_122612", "M3": "1188_133604", "M4": "61_70970"}
|
77 |
+
female_spks = {"F1": "2961_961", "F2": "8463_287645", "F3": "121_121726"}
|
78 |
+
spks = dict(male_spks, **female_spks)
|
79 |
+
spk_names = sorted(spks.keys())
|
80 |
+
|
81 |
+
def ASRTTS(audio_file, spk_name, ref_text=""):
|
82 |
+
spk = spks[spk_name]
|
83 |
+
spembs = xvectors[spk]
|
84 |
+
if ref_text == "":
|
85 |
+
reg_text = transcriber(audio_file)['text']
|
86 |
+
else:
|
87 |
+
reg_text = ref_text
|
88 |
+
|
89 |
+
speech, sr = torchaudio.load(audio_file, channels_first=True) # Mono channel
|
90 |
+
wav_tensor_spembs = text2speech(text=reg_text, speech=speech, spembs=spembs)["wav"]
|
91 |
+
wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
|
92 |
+
sample_rate = 22050
|
93 |
+
save_id = "./wav/" + Path(audio_file).stem + "_" + spk_name +"_spkembs.wav"
|
94 |
+
torchaudio.save(save_id, src=wav_tensor_spembs.unsqueeze(0).to("cpu"), sample_rate=22050)
|
95 |
+
|
96 |
+
return save_id, reg_text
|
97 |
+
|
98 |
+
def ref_reg_callback(audio_file, spk_name, ref_text):
|
99 |
+
reg_text = ref_text
|
100 |
+
return audio_file, spk_name, reg_text
|
101 |
+
|
102 |
+
reference_textbox = gr.Textbox(
|
103 |
+
value="",
|
104 |
+
placeholder="Input reference here",
|
105 |
+
label="Reference",
|
106 |
+
)
|
107 |
+
|
108 |
+
recognization_textbox = gr.Textbox(
|
109 |
+
value="",
|
110 |
+
placeholder="Output recognization here",
|
111 |
+
label="recognization_textbox",
|
112 |
+
)
|
113 |
+
|
114 |
+
speaker_option = gr.Radio(choices=spk_names, label="Speaker")
|
115 |
+
|
116 |
+
input_audio = gr.Audio(
|
117 |
+
source="microphone",
|
118 |
+
type="filepath",
|
119 |
+
label="Audio_to_Evaluate"
|
120 |
+
)
|
121 |
+
output_audio = gr.Audio(
|
122 |
+
source="upload",
|
123 |
+
file="filepath",
|
124 |
+
label="Synthesized Audio"
|
125 |
+
)
|
126 |
+
examples = [["./samples/001.wav",'M1', ""],
|
127 |
+
["./samples/002.wav",'M2', ""],
|
128 |
+
["./samples/003.wav",'F1', ""],
|
129 |
+
["./samples/004.wav",'F2', ""]]
|
130 |
+
|
131 |
+
# ASRTTS(*examples[0])
|
132 |
+
iface = gr.Interface(
|
133 |
+
fn = ASRTTS,
|
134 |
+
inputs = [
|
135 |
+
input_audio,
|
136 |
+
speaker_option,
|
137 |
+
reference_textbox,
|
138 |
+
],
|
139 |
+
outputs = [
|
140 |
+
output_audio,
|
141 |
+
recognization_textbox
|
142 |
+
],
|
143 |
+
examples = examples
|
144 |
+
)
|
145 |
+
iface.input_callback = ref_reg_callback
|
146 |
+
iface.launch(share=False)
|
local/app.whisper.py
ADDED
@@ -0,0 +1,281 @@
"""
TODO:
+ [x] Load Configuration
+ [ ] Checking
+ [ ] Better saving directory
"""
import numpy as np
from pathlib import Path
import torch.nn as nn
import torch
import torchaudio
from transformers import pipeline
from pathlib import Path

# local import
import sys
from espnet2.bin.tts_inference import Text2Speech
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC  # pdb.set_trace()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

sys.path.append("src")

import gradio as gr

# ASR part

audio_files = [
    str(x)
    for x in sorted(
        Path(
            "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
        ).glob("**/*wav")
    )
]
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
# transcriber = pipeline(
#     "automatic-speech-recognition",
#     model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
# )

from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("openai/whisper-medium")

model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-medium")

# feature_extractor = AutoFeatureExtractor.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
# representation_model = AutoModelForCTC.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
# tokenizer = AutoTokenizer.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")

import pdb
# pdb.set_trace()
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
# 【Female】kan-bayashi ljspeech parallel wavegan
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
# 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
# pdb.set_trace()

# @title English multi-speaker pretrained model { run: "auto" }
lang = "English"
tag = "kan-bayashi/libritts_xvector_vits"
# vits needs no
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long"  # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device="cuda",
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    speed_control_alpha=1.0,
)

import glob
import os
import numpy as np
import kaldiio

# Get model directory path
from espnet_model_zoo.downloader import ModelDownloader

d = ModelDownloader()
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])

# Speaker x-vector selection

xvector_ark = [
    p
    for p in glob.glob(
        f"xvector/test-clean/spk_xvector.ark", recursive=True
    )
    if "test" in p
][0]
xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
spks = list(xvectors.keys())

# pdb.set_trace()
# All old 20230101
# male_spks = {"Male1": "2300_131720", "Male2": "1320_122612", "Male3": "1188_133604",}
# "M4": "61_70970",
# female_spks = {"Female1": "2961_961", "Female2": "8463_287645", "Female3": "121_121726"}

# 6 scale from high to low,
male_spks = {"Male1": "4077_13751", "Male2": "1320_122612", "Male3": "7729_102255",}
female_spks = {"Female1": "5683_32865", "Female2": "121_121726", "Female3": "8463_287645"}
spks = dict(male_spks, **female_spks)
spk_names = sorted(spks.keys())


## 20230224 Mousa: No reference,
def ASRTTS(audio_file, spk_name, ref_text=""):
    spk = spks[spk_name]
    spembs = xvectors[spk]
    if ref_text == "":
        reg_text = transcriber(audio_file)["text"]
    else:
        reg_text = ref_text

    speech, sr = torchaudio.load(
        audio_file, channels_first=True
    )  # Mono channel
    wav_tensor_spembs = text2speech(
        text=reg_text, speech=speech, spembs=spembs
    )["wav"]
    wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
    sample_rate = 22050
    save_id = (
        "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
    )
    torchaudio.save(
        save_id,
        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
        sample_rate=22050,
    )

    return save_id, reg_text


def ASRTTS_clean(audio_file, spk_name):
    spk = spks[spk_name]
    spembs = xvectors[spk]

    reg_text = transcriber(audio_file)["text"]

    speech, sr = torchaudio.load(
        audio_file, channels_first=True
    )  # Mono channel
    wav_tensor_spembs = text2speech(
        text=reg_text, speech=speech, spembs=spembs
    )["wav"]
    wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
    sample_rate = 22050
    save_id = (
        "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
    )
    torchaudio.save(
        save_id,
        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
        sample_rate=22050,
    )
    return save_id


reference_textbox = gr.Textbox(
    value="",
    placeholder="Input reference here",
    label="Reference",
)

recognization_textbox = gr.Textbox(
    value="",
    placeholder="Output recognization here",
    label="recognization_textbox",
)

speaker_option = gr.Radio(choices=spk_names, label="Speaker")

input_audio = gr.Audio(
    source="upload", type="filepath", label="Audio_to_Evaluate"
)
output_audio = gr.Audio(
    source="upload", file="filepath", label="Synthesized Audio"
)
examples = [
    ["./samples/001.wav", "M1", ""],
    ["./samples/002.wav", "M2", ""],
    ["./samples/003.wav", "F1", ""],
    ["./samples/004.wav", "F2", ""],
]


def change_audiobox(choice):
    if choice == "upload":
        input_audio = gr.Audio.update(source="upload", visible=True)
    elif choice == "microphone":
        input_audio = gr.Audio.update(source="microphone", visible=True)
    else:
        input_audio = gr.Audio.update(visible=False)
    return input_audio


def show_icon(choice):
    if choice == "Male1":
        spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
    elif choice == "Male2":
        spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
    elif choice == "Male3":
        spk_icon = gr.Image.update(value="speaker_icons/male3.png", visible=True)
    elif choice == "Female1":
        spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
    elif choice == "Female2":
        spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
    elif choice == "Female3":
        spk_icon = gr.Image.update(value="speaker_icons/female3.png", visible=True)
    return spk_icon

def get_download_file(audio_file=None):
    if audio_file == None:
        output_audio_file = gr.File.update(visible=False)
    else:
        output_audio_file = gr.File.update(visible=True)
    return output_audio_file

def download_file(audio_file):
    return gr.File(value=audio_file)
# pdb.set_trace()

# if __name__ == "__main__":
#     file_share_app.run(port=3000)

with gr.Blocks(
    analytics_enabled=False,
    css=".gradio-container {background-color: #78BD91}",
) as demo:
    with gr.Column(elem_id="Column"):
        input_format = gr.Radio(
            choices=["microphone", "upload"], label="Choose your input format", elem_id="input_format"
        )
        input_audio = gr.Audio(
            source="microphone",
            type="filepath",
            label="Input Audio",
            interactive=True,
            visible=False,
            elem_id="input_audio"
        )
        input_format.change(
            fn=change_audiobox, inputs=input_format, outputs=input_audio
        )

        speaker_option = gr.Radio(choices=spk_names, value="Male1", label="Choose your voice profile")
        spk_icon = gr.Image(value="speaker_icons/male1.png",
                            type="filepath",
                            image_mode="RGB",
                            source="upload",
                            shape=[50, 50],
                            interactive=True,
                            visible=True)
        speaker_option.change(
            fn=show_icon, inputs=speaker_option, outputs=spk_icon
        )

        b2 = gr.Button("Convert")

        output_audio = gr.Audio(
            source="upload", file="filepath", label="Converted Audio", interactive=False
        )

        b2.click(
            ASRTTS_clean,
            inputs=[input_audio, speaker_option],
            outputs=output_audio,
            api_name="convert"
        )

        # download_file("wav/001_F1_spkembs.wav")

demo.launch(share=False)
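The file above loads a Whisper processor/model pair ("openai/whisper-medium") but still builds the CTC pipeline transcriber for inference. A minimal sketch (an assumption, not code from the commit) of how the loaded pair could be used directly for transcription; the helper name whisper_transcribe and the 16 kHz resampling are illustrative:

import torchaudio
def whisper_transcribe(audio_file: str) -> str:
    # load audio, downmix to mono, and resample to the 16 kHz Whisper expects
    wav, sr = torchaudio.load(audio_file)
    wav = torchaudio.functional.resample(wav, sr, 16000).mean(dim=0)
    # `processor` and `model` are the objects created above from "openai/whisper-medium"
    inputs = processor(wav.numpy(), sampling_rate=16000, return_tensors="pt")
    predicted_ids = model.generate(inputs["input_features"])
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]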
local/semi_streaming_ASR_TTS.py
ADDED
@@ -0,0 +1,175 @@
"""
TODO:
+ [x] Load Configuration
+ [ ] Checking
+ [ ] Better saving directory
"""
import numpy as np
from pathlib import Path
import jiwer
import pdb
import torch.nn as nn
import torch
import torchaudio
from transformers import pipeline
# from time import process_time, time
from pathlib import Path
import time
# local import
import sys
from espnet2.bin.tts_inference import Text2Speech

# pdb.set_trace()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

sys.path.append("src")

import gradio as gr

# ASR part

audio_files = [
    str(x)
    for x in sorted(
        Path(
            "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
        ).glob("**/*wav")
    )
]
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
transcriber = pipeline(
    "automatic-speech-recognition",
    model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
)
old_transcriber = pipeline(
    "automatic-speech-recognition", "facebook/wav2vec2-base-960h"
)
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
# 【Female】kan-bayashi ljspeech parallel wavegan
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
# 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
# pdb.set_trace()
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

# @title English multi-speaker pretrained model { run: "auto" }
lang = "English"
tag = "kan-bayashi/libritts_xvector_vits"
# vits needs no
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long"  # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device="cuda",
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    speed_control_alpha=1.0,
)


import glob
import os
import numpy as np
import kaldiio

# Get model directory path
from espnet_model_zoo.downloader import ModelDownloader

d = ModelDownloader()
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])

# Speaker x-vector selection

xvector_ark = [
    p
    for p in glob.glob(
        f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True
    )
    if "tr" in p
][0]
xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
spks = list(xvectors.keys())

male_spks = {
    "M1": "2300_131720",
    "M2": "1320_122612",
    "M3": "1188_133604",
    "M4": "61_70970",
}
female_spks = {"F1": "2961_961", "F2": "8463_287645", "F3": "121_121726"}
spks = dict(male_spks, **female_spks)
spk_names = sorted(spks.keys())

## 20230224 Mousa: No reference,
def ASRold(audio_file):
    reg_text = old_transcriber(audio_file)["text"]
    return reg_text


def ASRnew(audio_file, state=""):
    # pdb.set_trace()
    time.sleep(2)
    reg_text = transcriber(audio_file)["text"]
    state += reg_text + "\n"
    return state, state

def VAD(audio_file):
    # pdb.set_trace()
    reg_text = transcriber(audio_file)["text"]
    return 1


reference_textbox = gr.Textbox(
    value="",
    placeholder="Input reference here",
    label="Reference",
)

recognization_textbox = gr.Textbox(
    value="",
    placeholder="Output recognization here",
    label="recognization_textbox",
)

speaker_option = gr.Radio(choices=spk_names, label="Speaker")

input_audio = gr.Audio(
    source="upload", type="filepath", label="Audio_to_Evaluate"
)
output_audio = gr.Audio(
    source="upload", file="filepath", label="Synthesized Audio"
)
examples = [
    ["./samples/001.wav", "M1", ""],
    ["./samples/002.wav", "M2", ""],
    ["./samples/003.wav", "F1", ""],
    ["./samples/004.wav", "F2", ""],
]

def change_audiobox(choice):
    if choice == "upload":
        input_audio = gr.Audio.update(source="upload", visible=True)
    elif choice == "microphone":
        input_audio = gr.Audio.update(source="microphone", visible=True)
    else:
        input_audio = gr.Audio.update(visible=False)
    return input_audio

demo = gr.Interface(
    fn=ASRnew,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state"
    ],
    outputs=[
        "textbox",
        "state"
    ],
    live=True)
# ASRnew(["/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/20221228_video_good_normed_5/take1_001_norm.wav", "state"])
# VAD("/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/20221228_video_good_normed_5/take1_001_norm.wav")
demo.launch(share=False)
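jiwer is imported at the top of this script but never called; presumably it is intended for word-error-rate scoring of the accumulated transcript. A minimal, self-contained sketch (both strings are hypothetical, not from the commit):

import jiwer
reference = "the rainbow is a division of white light"  # hypothetical reference text
hypothesis = "the rainbow is division of white light"   # hypothetical ASR output
print("WER:", jiwer.wer(reference, hypothesis))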
local/streaming_VAD.py
ADDED
@@ -0,0 +1,74 @@
import pyaudio
import numpy as np
import webrtcvad

# Set up PyAudio
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 48000
CHUNK_SIZE = 960  # 20ms audio chunks
# p = pyaudio.PyAudio()

# wav = "/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/20221228_video_good_normed_5/take1_001_norm.wav"
wav = "/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/VAD_test.wav"
import wave
wf = wave.open(wav, "rb")
# import pdb
# stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
#                 channels=wf.getnchannels(),
#                 rate=wf.getframerate(),
#                 output=True)
# pdb.set_trace()
# Set up VAD

def streaming_VAD(wf):
    vad = webrtcvad.Vad()
    vad.set_mode(2)  # Aggressive mode

    # Start audio stream
    # stream = p.open(format=FORMAT,
    #                 channels=CHANNELS,
    #                 rate=RATE,
    #                 input=True,
    #                 frames_per_buffer=CHUNK_SIZE)

    # VAD constants
    MIN_SILENCE_DURATION = 2000  # in ms
    MAX_SILENCE_DURATION = 4000  # in ms
    BUFFER_SIZE = MAX_SILENCE_DURATION // CHUNK_SIZE
    BUFFER_THRESHOLD = int(BUFFER_SIZE * 0.5)

    # Initialize VAD buffer
    vad_buffer = []
    VAD_indicator = []
    VAD_frame_indicator = []
    data = wf.readframes(CHUNK_SIZE)
    # Loop through audio stream
    while data:
        # Read audio chunk from stream
        # pdb.set_trace()
        # audio_chunk = np.frombuffer(stream.read(CHUNK_SIZE), dtype=np.int16)
        audio_chunk = np.frombuffer(data, dtype=np.int16)
        # Detect voice activity
        # is_speech = vad.is_speech(audio_chunk.tobytes(), RATE)
        try:
            is_speech = vad.is_speech(audio_chunk, RATE)
        except:
            is_speech = False
        vad_buffer.append(is_speech)

        # If VAD buffer is full, check for silence and reset buffer
        if len(vad_buffer) == BUFFER_SIZE:
            # Check if buffer contains mostly silence
            if vad_buffer.count(False) >= BUFFER_THRESHOLD:
                # print("Slience")
                # VAD_indicator.append(0)
                # vad_buffer = []
                return(False)
            else:
                # print("Voice detected!")
                # VAD_indicator.append(1)
                vad_buffer = vad_buffer[CHUNK_SIZE // BUFFER_SIZE:]
                return(True)
        data = wf.readframes(CHUNK_SIZE)
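Usage sketch for the helper above (not part of the commit). webrtcvad only accepts 10/20/30 ms frames at 8/16/32/48 kHz, so CHUNK_SIZE = 960 samples at RATE = 48000 corresponds to a 20 ms frame; the helper returns True/False as soon as the first full buffer has been classified.

if __name__ == "__main__":
    # `wave` and the `wav` path are defined at the top of this script
    with wave.open(wav, "rb") as f:
        print("voice detected:", streaming_VAD(f))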
requirements.txt
CHANGED
@@ -28,7 +28,7 @@ fsspec==2022.2.0
 future==0.18.2
 google-auth==2.6.0
 google-auth-oauthlib==0.4.6
-gradio==3.
+gradio==3.18
 grpcio==1.44.0
 h11==0.12.0
 hydra-core==1.0.7
@@ -108,3 +108,8 @@ jiwer
 # charset
 
 gradio
+
+flask
+
+# datasets
+datasets
requirements.txt.bak.bak
DELETED
@@ -1,141 +0,0 @@
aiofiles==23.1.0
aiohttp==3.8.4
aiosignal==1.3.1
altair==4.2.2
antlr4-python3-runtime==4.8
anyio==3.6.2
appdirs==1.4.4
argcomplete==2.0.0
async-timeout==4.0.2
asynctest==0.13.0
attrs==22.2.0
audioread==3.0.0
beautifulsoup4==4.11.2
bitarray==2.7.2
black==23.1.0
brotlipy==0.7.0
cchardet==2.1.7
chardet==5.1.0
charset-normalizer==3.0.1
ci-sdr==0.0.2
click==8.1.3
colorama==0.4.6
ConfigArgParse==1.5.3
ctc-segmentation==1.7.4
cycler==0.11.0
Cython==0.29.33
decorator==5.1.1
Distance==0.1.3
editdistance==0.6.2
einops==0.6.0
entrypoints==0.4
espnet==202301
espnet-model-zoo==0.1.7
espnet-tts-frontend==0.0.3
fairseq==0.12.2
fast-bss-eval==0.1.3
fastapi==0.91.0
ffmpy==0.3.0
filelock==3.9.0
fonttools==4.38.0
frozenlist==1.3.3
fsspec==2023.1.0
g2p-en==2.1.0
gdown==4.6.3
gradio==3.18.0
h11==0.14.0
h5py==3.8.0
httpcore==0.16.3
httpx==0.23.3
huggingface-hub==0.12.0
humanfriendly==10.0
hydra-core==1.0.7
importlib-metadata==4.13.0
importlib-resources==5.10.2
inflect==6.0.2
jaconv==0.3.3
jamo==0.4.1
Jinja2==3.1.2
jiwer==2.5.1
joblib==1.2.0
jsonschema==4.17.3
kaldiio==2.17.2
kiwisolver==1.4.4
Levenshtein==0.20.2
librosa==0.9.2
linkify-it-py==1.0.3
llvmlite==0.39.1
lxml==4.9.2
markdown-it-py==2.1.0
MarkupSafe==2.1.2
matplotlib==3.5.3
mdit-py-plugins==0.3.3
mdurl==0.1.2
mkl-fft==1.3.1
mkl-service==2.4.0
multidict==6.0.4
mypy-extensions==1.0.0
nltk==3.8.1
numba==0.56.4
numpy==1.21.6
omegaconf==2.0.6
opt-einsum==3.3.0
orjson==3.8.6
packaging==23.0
pandas==1.3.5
parallel-wavegan==0.5.5
pathspec==0.11.0
Pillow==9.3.0
pkgutil_resolve_name==1.3.10
platformdirs==3.0.0
pooch==1.6.0
portalocker==2.7.0
protobuf==3.20.1
pycryptodome==3.17
pydantic==1.10.4
pydub==0.25.1
pyparsing==3.0.9
pypinyin==0.44.0
pyrsistent==0.19.3
python-dateutil==2.8.2
python-multipart==0.0.5
pytorch-wpe==0.0.1
pytz==2022.7.1
pyworld==0.3.2
PyYAML==6.0
rapidfuzz==2.13.7
regex==2022.10.31
requests==2.28.2
resampy==0.4.2
rfc3986==1.5.0
sacrebleu==2.3.1
scikit-learn==1.0.2
scipy==1.7.3
sentencepiece==0.1.97
sniffio==1.3.0
soundfile==0.11.0
soupsieve==2.4
starlette==0.24.0
tabulate==0.9.0
tensorboardX==2.6
threadpoolctl==3.1.0
tokenizers==0.13.2
toml==0.10.2
tomli==2.0.1
toolz==0.12.0
torch==1.12.1
torch-complex==0.4.3
torchaudio==0.12.1
torchvision==0.13.1
tqdm==4.64.1
transformers==4.26.1
typed-ast==1.5.4
typeguard==2.13.3
uc-micro-py==1.0.1
Unidecode==1.3.6
uvicorn==0.20.0
websockets==10.4
xmltodict==0.13.0
yarl==1.8.2
yq==3.1.0
zipp==3.13.0
speaker_icons/female1.png
CHANGED
speaker_icons/female2.png
CHANGED
speaker_icons/female3.png
CHANGED
speaker_icons/male-4.png
DELETED
Binary file (355 kB)
speaker_icons/male1.png
CHANGED
speaker_icons/male3.png
CHANGED
speaker_icons/male4.png
ADDED