# NOTE: removed non-Python scrape residue ("Spaces:" / "Build error" lines)
# that preceded the module docstring and would break parsing.
""" | |
TODO: | |
+ [x] Load Configuration | |
+ [ ] Checking | |
+ [ ] Better saving directory | |
""" | |
import numpy as np | |
from pathlib import Path | |
import jiwer | |
import pdb | |
import torch.nn as nn | |
import torch | |
import torchaudio | |
from transformers import pipeline | |
from time import process_time, time | |
from pathlib import Path | |
# local import | |
import sys | |
from espnet2.bin.tts_inference import Text2Speech | |
# pdb.set_trace() | |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |
sys.path.append("src") | |
import gradio as gr | |
# ---------------------------------------------------------------------------
# ASR part: collect evaluation audio and load the candidate ASR pipelines.
# NOTE(review): absolute path to the author's workstation — this glob will be
# empty on any other machine; consider making it configurable.
audio_files = [
    str(x)
    for x in sorted(
        Path(
            "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
        ).glob("**/*wav")
    )
]
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]

# Fine-tuned wav2vec2+CTC model ("Ver1" engine; used by ASRnew below).
transcriber = pipeline(
    "automatic-speech-recognition",
    model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
)
# Off-the-shelf baseline (used by ASRold).
old_transcriber = pipeline(
    "automatic-speech-recognition", "facebook/wav2vec2-base-960h"
)
# Whisper variants fine-tuned on different speakers/datasets.
# NOTE(review): "whipser" in several model ids is a typo baked into the
# uploaded repo names — do not "correct" it or the downloads will fail.
whisper_transcriber = pipeline(
    "automatic-speech-recognition", "KevinGeng/whipser_medium_en_PAL300_step25"
)
whisper_transcriber_org = pipeline(
    "automatic-speech-recognition", "KevinGeng/whisper-medium-PAL128-25step"
)
whisper_transcriber_Tony = pipeline(
    "automatic-speech-recognition", "KevinGeng/Tony1_AVA_script_conv_train_conv_dev"
)
whisper_transcriber_John = pipeline(
    "automatic-speech-recognition", "KevinGeng/whipser_medium_en_PAL300_step25_step2_VTCK"
)
whisper_transcriber_Negel = pipeline(
    "automatic-speech-recognition", "KevinGeng/Negel_152_AVA_script_conv_train_conv_dev"
)
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
# [Female] kan-bayashi ljspeech parallel wavegan
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
# [Male] fastspeech2-en-200_speaker-cv4, hifigan vocoder
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

# @title English multi-speaker pretrained model { run: "auto" }
# English multi-speaker TTS: x-vector-conditioned VITS trained on LibriTTS.
lang = "English"
tag = "kan-bayashi/libritts_xvector_vits"
# VITS is end-to-end, so no separate vocoder is strictly needed.
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long"  # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Build the synthesizer once at startup.
# NOTE(review): device is hard-coded to "cuda" even though a CPU fallback
# `device` is computed near the top of the file — confirm whether that
# variable should be passed here instead for CPU-only hosts.
text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device="cuda",
    # Attention-constraint / window options; only relevant for
    # attention-based models, passed through harmlessly for VITS.
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    speed_control_alpha=1.0,
)

import glob
import os
import numpy as np
import kaldiio

# Get model directory path so the speaker x-vector dumps can be located.
from espnet_model_zoo.downloader import ModelDownloader

d = ModelDownloader()
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
# BUG FIX: removed a stray `pdb.set_trace()` left from debugging — it froze
# the whole app at startup waiting for interactive debugger input.
# Speaker x-vector selection: locate the speaker-embedding archive shipped
# in the downloaded model's dump directory, preferring a training split
# (any path containing "tr"), and load it into an id -> vector mapping.
# NOTE(review): [0] raises IndexError if no matching archive exists —
# presumably guaranteed by the model package; confirm.
xvector_ark = [
    p
    for p in glob.glob(
        f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True
    )
    if "tr" in p
][0]
# Mapping: speaker id (e.g. "2300_131720") -> x-vector (numpy array).
xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
# Curated subset of LibriTTS speakers exposed in the UI.
# BUG FIX: removed the dead `spks = list(xvectors.keys())` assignment —
# `spks` was unconditionally rebuilt from the tables below, so the raw
# key list was never read.
male_spks = {
    "M1": "2300_131720",
    "M2": "1320_122612",
    "M3": "1188_133604",
    "M4": "61_70970",
}
female_spks = {"F1": "2961_961", "F2": "8463_287645", "F3": "121_121726"}
# Merged table: display name -> x-vector key into `xvectors`.
spks = dict(male_spks, **female_spks)
# Sorted display names for the speaker radio widget.
spk_names = sorted(spks.keys())
## 20230224 Mousa: No reference, | |
def ASRold(audio_file):
    """Transcribe *audio_file* with the off-the-shelf wav2vec2 baseline."""
    return old_transcriber(audio_file)["text"]
def ASRnew(audio_file):
    """Transcribe *audio_file* with the fine-tuned wav2vec2+CTC engine."""
    return transcriber(audio_file)["text"]
def ASRwhipser_FT(audio_file):
    """Transcribe *audio_file* with the fine-tuned Whisper engine."""
    return whisper_transcriber(audio_file)["text"]
def ASRwhipser_Org(audio_file):
    """Transcribe *audio_file* with the PAL128-step25 Whisper variant."""
    return whisper_transcriber_org(audio_file)["text"]
def ASRwhipser_Tony(audio_file):
    """Transcribe *audio_file* with the Whisper model fine-tuned on Tony."""
    return whisper_transcriber_Tony(audio_file)["text"]
def ASRwhipser_Negel(audio_file):
    """Transcribe *audio_file* with the Whisper model fine-tuned on Negel."""
    return whisper_transcriber_Negel(audio_file)["text"]
def ASRwhipser_John(audio_file):
    """Transcribe *audio_file* with the Whisper model fine-tuned on John."""
    return whisper_transcriber_John(audio_file)["text"]
# def ref_reg_callback(audio_file, spk_name, ref_text): | |
# reg_text = ref_text | |
# return audio_file, spk_name, reg_text | |
# Pre-built widgets for a reference-vs-recognition comparison layout.
# NOTE(review): these two textboxes are not referenced by the Blocks demo
# at the bottom of this file — presumably leftovers from an earlier
# Interface-based UI; confirm before deleting.
reference_textbox = gr.Textbox(
    value="",
    placeholder="Input reference here",
    label="Reference",
)
recognization_textbox = gr.Textbox(
    value="",
    placeholder="Output recognization here",
    label="recognization_textbox",
)
# Speaker selector for TTS voice choice (options come from `spk_names`).
# NOTE(review): not wired into the Blocks demo below.
speaker_option = gr.Radio(choices=spk_names, label="Speaker")
# speaker_profiles = {
#     "Male_1": "speaker_icons/male1.png",
#     "Male_2": "speaker_icons/male2.png",
#     "Female_1": "speaker_icons/female1.png",
#     "Female_2": "speaker_icons/female2.png",
# }
# speaker_option = gr.Image(label="Choose your speaker profile",
#                          image_mode="RGB",
#                          options=speaker_profiles
#                          )
# Module-level upload widget; the Blocks demo below defines its own
# `input_audio`, shadowing this one.
input_audio = gr.Audio(
    source="upload", type="filepath", label="Audio_to_Evaluate"
)
# Playback widget for synthesized speech (not yet wired into the demo).
# BUG FIX: `file="filepath"` is not a gr.Audio parameter and would raise a
# TypeError at construction; the intended keyword is `type="filepath"`,
# matching the sibling `input_audio` widget above.
output_audio = gr.Audio(
    source="upload", type="filepath", label="Synthesized Audio"
)
# Example rows: (sample audio path, speaker id, empty reference text).
examples = [
    [f"./samples/{index:03d}.wav", speaker, ""]
    for index, speaker in enumerate(("M1", "M2", "F1", "F2"), start=1)
]
def change_audiobox(choice):
    """Show or hide the input-audio widget to match the chosen format.

    Returns a gr.Audio update: visible with the selected source for
    "upload" or "microphone", hidden for any other value.
    """
    if choice in ("upload", "microphone"):
        return gr.Audio.update(source=choice, visible=True)
    return gr.Audio.update(visible=False)
# ---------------------------------------------------------------------------
# Gradio UI: one shared audio input plus six buttons, each running a
# different ASR engine into its own output textbox.
# NOTE(review): labels "Convertional" and "Purposed" look like typos for
# "Conventional" / "Proposed"; they are user-visible strings, so they are
# deliberately left unchanged here.
with gr.Blocks(
    analytics_enabled=False,
    css=".gradio-container {background-color: #78BD91}",
) as demo:
    with gr.Column():
        # Radio toggling between file upload and microphone capture; the
        # audio widget starts hidden until a format is chosen.
        input_format = gr.Radio(
            choices=["upload", "microphone"], label="Choose your input format"
        )
        input_audio = gr.Audio(
            source="upload",
            type="filepath",
            label="Input Audio",
            interactive=True,
            visible=False,
        )
        input_format.change(
            fn=change_audiobox, inputs=input_format, outputs=input_audio
        )
        # Row 1: off-the-shelf wav2vec2 baseline.
        with gr.Row():
            b1 = gr.Button("Conventional Speech Recognition Engine")
            t1 = gr.Textbox(
                value="",
                placeholder="Recognition output",
                label="Convertional",
            )
        b1.click(
            ASRold, inputs=[input_audio], outputs=t1
        )
        # Row 2: fine-tuned wav2vec2+CTC engine.
        with gr.Row():
            b2 = gr.Button("Laronix Speech Recognition Engine (Ver1, wav2vec2.0+CTC)")
            t2 = gr.Textbox(
                value="",
                placeholder="Recognition output",
                label="Purposed",
            )
        b2.click(
            ASRnew, inputs=[input_audio], outputs=t2
        )
        # Row 3: fine-tuned Whisper engine.
        with gr.Row():
            b3 = gr.Button("Laronix Speech Recognition Engine (Ver2, Whipser)")
            t3 = gr.Textbox(
                value="",
                placeholder="Recognition output",
                label="Purposed",
            )
        b3.click(
            ASRwhipser_FT, inputs=[input_audio], outputs=t3
        )
        # Row 4: Whisper fine-tuned on speaker Tony.
        with gr.Row():
            b4 = gr.Button("Laronix Speech Recognition Engine (Whipser, FT with Tony)")
            t4 = gr.Textbox(
                value="",
                placeholder="Recognition output",
                label="Purposed",
            )
        b4.click(
            ASRwhipser_Tony, inputs=[input_audio], outputs=t4
        )
        # Row 5: Whisper fine-tuned on speaker John.
        with gr.Row():
            b5 = gr.Button("Laronix Speech Recognition Engine (Whipser, FT with John)")
            t5 = gr.Textbox(
                value="",
                placeholder="Recognition output",
                label="Purposed",
            )
        b5.click(
            ASRwhipser_John, inputs=[input_audio], outputs=t5
        )
        # Row 6: Whisper fine-tuned on speaker Negel.
        with gr.Row():
            b6 = gr.Button("Laronix Speech Recognition Engine (Whipser, FT with Negel)")
            t6 = gr.Textbox(
                value="",
                placeholder="Recognition output",
                label="Purposed",
            )
        b6.click(
            ASRwhipser_Negel, inputs=[input_audio], outputs=t6
        )

# Launch with a public share link (share=True exposes a temporary public URL).
demo.launch(share=True)