# Laronix_ASR_TTS_VC / local / app.whisper.py
"""
TODO:
+ [x] Load Configuration
+ [ ] Checking
+ [ ] Better saving directory
"""
import numpy as np
from pathlib import Path
import torch.nn as nn
import torch
import torchaudio
from transformers import pipeline
# local import
import sys
from espnet2.bin.tts_inference import Text2Speech
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
sys.path.append("src")
import gradio as gr
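# Pipeline overview: transcribe the input audio with a fine-tuned Whisper ASR model,
# re-synthesize the transcript with a multi-speaker VITS TTS model conditioned on a
# speaker x-vector, and expose the whole flow through a Gradio UI.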
# ASR part
audio_files = [
    str(x)
    for x in sorted(
        Path(
            "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
        ).glob("**/*wav")
    )
]
# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
# transcriber = pipeline(
# "automatic-speech-recognition",
# model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
# )
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
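# The base openai/whisper-medium processor/model are loaded here but not called
# directly in this script; transcription goes through the fine-tuned pipeline below.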
processor = AutoProcessor.from_pretrained("openai/whisper-medium")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-medium")
# feature_extractor = AutoFeatureExtractor.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
# representation_model = AutoModelForCTC.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
# tokenizer = AutoTokenizer.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
import pdb
# pdb.set_trace()
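# ASR pipeline actually used for transcription: a fine-tuned Whisper checkpoint.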
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
# 【Female】kan-bayashi ljspeech parallel wavegan
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
# 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
# pdb.set_trace()
# @title English multi-speaker pretrained model { run: "auto" }
lang = "English"
tag = "kan-bayashi/libritts_xvector_vits"
# VITS needs no external vocoder
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none
text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device="cuda",
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    speed_control_alpha=1.0,
)
import glob
import os
import numpy as np
import kaldiio
# Get model directory path
from espnet_model_zoo.downloader import ModelDownloader
d = ModelDownloader()
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
# Speaker x-vector selection
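# x-vectors (speaker embeddings) are loaded from a Kaldi ark for LibriTTS test-clean
# speakers; the dictionaries below map the selectable voice profiles to speaker ids.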
xvector_ark = [
    p
    for p in glob.glob(
        "xvector/test-clean/spk_xvector.ark", recursive=True
    )
    if "test" in p
][0]
xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
spks = list(xvectors.keys())
# pdb.set_trace()
# All old 20230101
# male_spks = {"Male1": "2300_131720", "Male2": "1320_122612", "Male3": "1188_133604",}
# "M4": "61_70970",
# female_spks = {"Female1": "2961_961", "Female2": "8463_287645", "Female3": "121_121726"}
# 6 scale from high to low,
male_spks = {"Male1": "4077_13751", "Male2": "1320_122612", "Male3": "7729_102255",}
female_spks = {"Female1": "5683_32865", "Female2": "121_121726", "Female3": "8463_287645"}
spks = dict(male_spks, **female_spks)
spk_names = sorted(spks.keys())
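# The synthesis functions below save to "./wav/"; create it up front so
# torchaudio.save does not fail on a missing directory (see the
# "Better saving directory" TODO above).
Path("wav").mkdir(parents=True, exist_ok=True)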
## 20230224 Mousa: reference text is optional; when empty, the ASR transcript is used
def ASRTTS(audio_file, spk_name, ref_text=""):
spk = spks[spk_name]
spembs = xvectors[spk]
if ref_text == "":
reg_text = transcriber(audio_file)["text"]
else:
reg_text = ref_text
speech, sr = torchaudio.load(
audio_file, channels_first=True
) # Mono channel
wav_tensor_spembs = text2speech(
text=reg_text, speech=speech, spembs=spembs
)["wav"]
wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
sample_rate = 22050
save_id = (
"./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
)
torchaudio.save(
save_id,
src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
sample_rate=22050,
)
return save_id, reg_text
def ASRTTS_clean(audio_file, spk_name):
    """Same as ASRTTS but without a reference text; always uses the ASR transcript."""
    spk = spks[spk_name]
    spembs = xvectors[spk]
    reg_text = transcriber(audio_file)["text"]
    speech, sr = torchaudio.load(
        audio_file, channels_first=True
    )  # Mono channel
    wav_tensor_spembs = text2speech(
        text=reg_text, speech=speech, spembs=spembs
    )["wav"]
    wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
    sample_rate = 22050
    save_id = (
        "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
    )
    torchaudio.save(
        save_id,
        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
        sample_rate=22050,
    )
    return save_id
reference_textbox = gr.Textbox(
    value="",
    placeholder="Input reference here",
    label="Reference",
)
recognition_textbox = gr.Textbox(
    value="",
    placeholder="Output recognition here",
    label="Recognition",
)
speaker_option = gr.Radio(choices=spk_names, label="Speaker")
input_audio = gr.Audio(
    source="upload", type="filepath", label="Audio_to_Evaluate"
)
output_audio = gr.Audio(
    source="upload", type="filepath", label="Synthesized Audio"
)
examples = [
    ["./samples/001.wav", "Male1", ""],
    ["./samples/002.wav", "Male2", ""],
    ["./samples/003.wav", "Female1", ""],
    ["./samples/004.wav", "Female2", ""],
]
def change_audiobox(choice):
    """Show the audio input with the selected source, or hide it."""
    if choice == "upload":
        input_audio = gr.Audio.update(source="upload", visible=True)
    elif choice == "microphone":
        input_audio = gr.Audio.update(source="microphone", visible=True)
    else:
        input_audio = gr.Audio.update(visible=False)
    return input_audio
def show_icon(choice):
    """Map the selected voice profile to its icon image."""
    if choice == "Male1":
        spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
    elif choice == "Male2":
        spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
    elif choice == "Male3":
        spk_icon = gr.Image.update(value="speaker_icons/male3.png", visible=True)
    elif choice == "Female1":
        spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
    elif choice == "Female2":
        spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
    elif choice == "Female3":
        spk_icon = gr.Image.update(value="speaker_icons/female3.png", visible=True)
    return spk_icon
def get_download_file(audio_file=None):
    """Show the download widget only when a file is available."""
    if audio_file is None:
        output_audio_file = gr.File.update(visible=False)
    else:
        output_audio_file = gr.File.update(visible=True)
    return output_audio_file
def download_file(audio_file):
    return gr.File(value=audio_file)
# pdb.set_trace()
# if __name__ == "__main__":
# file_share_app.run(port=3000)
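# Gradio UI: choose microphone or upload input, pick a voice profile (with icon
# preview), and convert the recording into the selected voice.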
with gr.Blocks(
    analytics_enabled=False,
    css=".gradio-container {background-color: #78BD91}",
) as demo:
    with gr.Column(elem_id="Column"):
        input_format = gr.Radio(
            choices=["microphone", "upload"], label="Choose your input format", elem_id="input_format"
        )
        input_audio = gr.Audio(
            source="microphone",
            type="filepath",
            label="Input Audio",
            interactive=True,
            visible=False,
            elem_id="input_audio",
        )
        input_format.change(
            fn=change_audiobox, inputs=input_format, outputs=input_audio
        )
        speaker_option = gr.Radio(choices=spk_names, value="Male1", label="Choose your voice profile")
        spk_icon = gr.Image(
            value="speaker_icons/male1.png",
            type="filepath",
            image_mode="RGB",
            source="upload",
            shape=[50, 50],
            interactive=True,
            visible=True,
        )
        speaker_option.change(
            fn=show_icon, inputs=speaker_option, outputs=spk_icon
        )
        b2 = gr.Button("Convert")
        output_audio = gr.Audio(
            source="upload", type="filepath", label="Converted Audio", interactive=False
        )
        b2.click(
            ASRTTS_clean,
            inputs=[input_audio, speaker_option],
            outputs=output_audio,
            api_name="convert",
        )
# download_file("wav/001_F1_spkembs.wav")
demo.launch(share=False)