Build error
remove app.ver1.py app.whisper.fine_tuned.py
- app.ver1.py +0 -72
- app.whisper.fine_tuned.py +0 -272
app.ver1.py
DELETED
@@ -1,72 +0,0 @@
-#TODO:
-# + [x] Load Configuration
-# + [ ] Checking
-# + [ ] Better saving directory
-
-from pathlib import Path
-from transformers import pipeline
-import torch.nn as nn
-import torch
-import torchaudio
-import gradio as gr
-import sys
-
-# Local imports
-sys.path.append("src")
-from espnet2.bin.tts_inference import Text2Speech
-from espnet2.utils.types import str_or_none
-
-# Check if GPU is available
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-# ASR part
-
-data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
-audio_files = sorted(list(Path(data_path).glob("**/*wav")))
-# audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
-
-transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
-
-# TTS part
-def load_model(lang, tag, vocoder_tag):
-    if lang == "Japanese":
-        if tag == "kan-bayashi/ljspeech_parallel_wavegan":
-            tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_parallel_wavegan")
-        elif tag == "kan-bayashi/ljspeech_merlin_multi_band_melgan":
-            tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_merlin_multi_band_melgan")
-        else:
-            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
-        vocoder = None if vocoder_tag == "none" else vocoder_tag
-    elif lang == "English":
-        # VITS needs no vocoder; others do
-        if tag == "kan-bayashi/libritts_xvector_vits":
-            tts_model = Text2Speech.from_pretrained("kan-bayashi/libritts_xvector_vits")
-            vocoder = None
-        elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
-            tts_model = Text2Speech.from_pretrained("kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3")
-            vocoder = "melgan"
-        else:
-            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
-    else:
-        raise ValueError(f"Not supported: lang={lang}")
-    return tts_model, vocoder
-
-tts_model, vocoder_tag = load_model(lang="English", tag="kan-bayashi/libritts_xvector_vits", vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long")
-tts_model = tts_model.to(device)
-
-vocoder = None if vocoder_tag == "none" else torchaudio.models.vocoder.from_pretrained(vocoder_tag).to(device)
-
-# Gradio part
-def synthesize(text):
-    with torch.no_grad():
-        # Text-to-speech
-        wav = tts_model(text)[0]
-        if vocoder is not None:
-            # Apply vocoder
-            wav = vocoder.inference(wav)
-        # Convert to numpy array
-        wav = wav.squeeze().cpu().numpy()
-    return wav
-
-interface = gr.Interface(synthesize, inputs="text", outputs="audio")
-interface.launch()
app.whisper.fine_tuned.py
DELETED
@@ -1,272 +0,0 @@
-"""
-TODO:
-+ [x] Load Configuration
-+ [ ] Checking
-+ [ ] Better saving directory
-"""
-import numpy as np
-from pathlib import Path
-import torch.nn as nn
-import torch
-import torchaudio
-from transformers import pipeline
-from pathlib import Path
-
-# local import
-import sys
-from espnet2.bin.tts_inference import Text2Speech
-from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC# pdb.set_trace()
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-sys.path.append("src")
-
-import gradio as gr
-
-# ASR part
-
-audio_files = [
-    str(x)
-    for x in sorted(
-        Path(
-            "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
-        ).glob("**/*wav")
-    )
-]
-# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
-# transcriber = pipeline(
-#     "automatic-speech-recognition",
-#     model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
-# )
-
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-
-processor = AutoProcessor.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
-
-model = AutoModelForSpeechSeq2Seq.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
-
-# feature_extractor = AutoFeatureExtractor.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
-# representation_model = AutoModelForCTC.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
-# tokenizer = AutoTokenizer.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
-
-transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
-# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
-# 【Female】kan-bayashi ljspeech parallel wavegan
-# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
-# 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
-# pdb.set_trace()
-
-# @title English multi-speaker pretrained model { run: "auto" }
-lang = "English"
-tag = "kan-bayashi/libritts_xvector_vits"
-# vits needs no
-vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
-from espnet2.bin.tts_inference import Text2Speech
-from espnet2.utils.types import str_or_none
-
-text2speech = Text2Speech.from_pretrained(
-    model_tag=str_or_none(tag),
-    vocoder_tag=str_or_none(vocoder_tag),
-    device="cuda",
-    use_att_constraint=False,
-    backward_window=1,
-    forward_window=3,
-    speed_control_alpha=1.0,
-)
-
-import glob
-import os
-import numpy as np
-import kaldiio
-
-# Get model directory path
-from espnet_model_zoo.downloader import ModelDownloader
-
-d = ModelDownloader()
-model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
-
-# Speaker x-vector selection
-
-xvector_ark = [
-    p
-    for p in glob.glob(
-        f"xvector/test-clean/spk_xvector.ark", recursive=True
-    )
-    if "test" in p
-][0]
-xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
-spks = list(xvectors.keys())
-
-male_spks = {
-    "Male1": "2300_131720",
-    "Male2": "1320_122612",
-}
-# "M3": "1188_133604",
-# "M4": "61_70970",
-female_spks = {"Female1": "2961_961", "Female2": "8463_287645", }
-# "F3": "121_121726"
-spks = dict(male_spks, **female_spks)
-spk_names = sorted(spks.keys())
-
-
-## 20230224 Mousa: No reference,
-def ASRTTS(audio_file, spk_name, ref_text=""):
-    spk = spks[spk_name]
-    spembs = xvectors[spk]
-    if ref_text == "":
-        reg_text = transcriber(audio_file)["text"]
-    else:
-        reg_text = ref_text
-
-    speech, sr = torchaudio.load(
-        audio_file, channels_first=True
-    ) # Mono channel
-    wav_tensor_spembs = text2speech(
-        text=reg_text, speech=speech, spembs=spembs
-    )["wav"]
-    wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
-    sample_rate = 22050
-    save_id = (
-        "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
-    )
-    torchaudio.save(
-        save_id,
-        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
-        sample_rate=22050,
-    )
-
-    return save_id, reg_text
-
-
-def ASRTTS_clean(audio_file, spk_name):
-    spk = spks[spk_name]
-    spembs = xvectors[spk]
-
-    reg_text = transcriber(audio_file)["text"]
-
-    speech, sr = torchaudio.load(
-        audio_file, channels_first=True
-    ) # Mono channel
-    wav_tensor_spembs = text2speech(
-        text=reg_text, speech=speech, spembs=spembs
-    )["wav"]
-    wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
-    sample_rate = 22050
-    save_id = (
-        "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
-    )
-    torchaudio.save(
-        save_id,
-        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
-        sample_rate=22050,
-    )
-    return save_id
-
-
-reference_textbox = gr.Textbox(
-    value="",
-    placeholder="Input reference here",
-    label="Reference",
-)
-
-recognization_textbox = gr.Textbox(
-    value="",
-    placeholder="Output recognization here",
-    label="recognization_textbox",
-)
-
-speaker_option = gr.Radio(choices=spk_names, label="Speaker")
-
-input_audio = gr.Audio(
-    source="upload", type="filepath", label="Audio_to_Evaluate"
-)
-output_audio = gr.Audio(
-    source="upload", file="filepath", label="Synthesized Audio"
-)
-examples = [
-    ["./samples/001.wav", "M1", ""],
-    ["./samples/002.wav", "M2", ""],
-    ["./samples/003.wav", "F1", ""],
-    ["./samples/004.wav", "F2", ""],
-]
-
-
-def change_audiobox(choice):
-    if choice == "upload":
-        input_audio = gr.Audio.update(source="upload", visible=True)
-    elif choice == "microphone":
-        input_audio = gr.Audio.update(source="microphone", visible=True)
-    else:
-        input_audio = gr.Audio.update(visible=False)
-    return input_audio
-
-
-def show_icon(choice):
-    if choice == "Male1":
-        spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
-    elif choice == "Male2":
-        spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
-    elif choice == "Female1":
-        spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
-    elif choice == "Female2":
-        spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
-    return spk_icon
-
-def get_download_file(audio_file=None):
-    if audio_file == None:
-        output_audio_file = gr.File.update(visible=False)
-    else:
-        output_audio_file = gr.File.update(visible=True)
-    return output_audio_file
-
-def download_file(audio_file):
-    return gr.File(value=audio_file)
-# pdb.set_trace()
-
-with gr.Blocks(
-    analytics_enabled=False,
-    css=".gradio-container {background-color: #78BD91}",
-) as demo:
-    with gr.Column(elem_id="Column"):
-        input_format = gr.Radio(
-            choices=["microphone", "upload"], label="Choose your input format", elem_id="input_format"
-        )
-        input_audio = gr.Audio(
-            source="microphone",
-            type="filepath",
-            label="Input Audio",
-            interactive=True,
-            visible=False,
-            elem_id="input_audio"
-        )
-        input_format.change(
-            fn=change_audiobox, inputs=input_format, outputs=input_audio
-        )
-
-        speaker_option = gr.Radio(choices=spk_names, value="Male1", label="Choose your voice profile")
-        spk_icon = gr.Image(value="speaker_icons/male1.png",
-                            type="filepath",
-                            image_mode="RGB",
-                            source="upload",
-                            shape=[50, 50],
-                            interactive=True,
-                            visible=True)
-        speaker_option.change(
-            fn=show_icon, inputs=speaker_option, outputs=spk_icon
-        )
-
-        b2 = gr.Button("Convert")
-
-        output_audio = gr.Audio(
-            source="upload", file="filepath", label="Converted Audio", interactive=False
-        )
-
-        b2.click(
-            ASRTTS_clean,
-            inputs=[input_audio, speaker_option],
-            outputs=output_audio,
-            api_name="convert"
-        )
-
-# download_file("wav/001_F1_spkembs.wav")
-
-demo.launch(share=False)
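Note: gr.Audio takes no file= keyword (path mode is selected with type="filepath"), so both output_audio constructions in this file would raise a TypeError when the Space starts, which fits the "Build error" this commit reverts. A minimal corrected sketch of that component, assuming the Gradio 3.x API used elsewhere in the file:

import gradio as gr

# type="filepath" (not file="filepath") makes the component pass file paths,
# matching what ASRTTS_clean returns (the path of the saved wav).
output_audio = gr.Audio(
    type="filepath",
    label="Converted Audio",
    interactive=False,
)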