# Hugging Face Spaces app (the original page header noted "Build error" on previous builds)
# TODO:
#  + [x] Load Configuration
#  + [ ] Checking
#  + [ ] Better saving directory
import sys
from pathlib import Path

import gradio as gr
import torch
import torch.nn as nn
import torchaudio
from transformers import pipeline

# Local imports: make the project's src/ importable before pulling in espnet.
sys.path.append("src")
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Prefer the first CUDA device when one is present; otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# ASR part: gather every recording under the data directory (recursive),
# sorted for deterministic ordering.
data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
audio_files = sorted(list(Path(data_path).glob("**/*wav")))
# audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
# TTS part | |
def load_model(lang, tag, vocoder_tag):
    """Load a pretrained ESPnet Text2Speech model for the given language/tag.

    Args:
        lang: Language selector; only "Japanese" and "English" are handled.
        tag: Pretrained-model tag passed to ``Text2Speech.from_pretrained``.
        vocoder_tag: Vocoder selector; "none" means no separate vocoder.
            Only consulted on the Japanese path — the English path decides
            the vocoder from the model tag instead.

    Returns:
        (tts_model, vocoder) where vocoder is None when no separate
        vocoder step is required.

    Raises:
        ValueError: For any unsupported (lang, tag) combination.
    """
    if lang == "Japanese":
        known_tags = (
            "kan-bayashi/ljspeech_parallel_wavegan",
            "kan-bayashi/ljspeech_merlin_multi_band_melgan",
        )
        if tag not in known_tags:
            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
        tts_model = Text2Speech.from_pretrained(tag)
        vocoder = None if vocoder_tag == "none" else vocoder_tag
    elif lang == "English":
        # VITS is end-to-end and needs no vocoder; the FastSpeech2 model does.
        if tag == "kan-bayashi/libritts_xvector_vits":
            tts_model = Text2Speech.from_pretrained(tag)
            vocoder = None
        elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
            tts_model = Text2Speech.from_pretrained(tag)
            vocoder = "melgan"
        else:
            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
    else:
        raise ValueError(f"Not supported: lang={lang}")
    return tts_model, vocoder
# Load the default English model. VITS is end-to-end, so load_model returns
# vocoder=None for this tag regardless of the vocoder_tag argument.
tts_model, vocoder_tag = load_model(
    lang="English",
    tag="kan-bayashi/libritts_xvector_vits",
    vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long",
)
# (Removed a leftover `import pdb; pdb.set_trace()` breakpoint that halted
# the app on startup.)
tts_model = tts_model.to(device)
# Treat both the "none" sentinel and an actual None return as "no vocoder".
# The original only checked == "none", so the None returned above fell
# through to `torchaudio.models.vocoder.from_pretrained(...)`, which does
# not exist in torchaudio and raised AttributeError.
# NOTE(review): if a separate neural vocoder is ever needed, load it via its
# real API (e.g. the parallel_wavegan package) — TODO confirm intended loader.
vocoder = None if vocoder_tag in (None, "none") else vocoder_tag
# Gradio part | |
def synthesize(text):
    """Synthesize speech from `text` for the Gradio audio output.

    Args:
        text: Input text to convert to speech.

    Returns:
        (sampling_rate, waveform) — the tuple form Gradio's "audio" output
        component expects for numpy data. The original returned a bare
        ndarray, which carries no sample rate and is not a valid numpy
        return value for an "audio" output.
    """
    with torch.no_grad():
        # Text-to-speech.
        # NOTE(review): newer espnet2 Text2Speech returns a dict (use
        # result["wav"]); `[0]` assumes the older tuple return — confirm
        # against the pinned espnet version.
        wav = tts_model(text)[0]
        if vocoder is not None:
            # Apply the separate neural vocoder when one is configured.
            wav = vocoder.inference(wav)
    # Convert to a 1-D numpy array on the CPU.
    wav = wav.squeeze().cpu().numpy()
    # tts_model.fs is the output sampling rate exposed by espnet's Text2Speech.
    return tts_model.fs, wav
# Gradio UI: a single text box in, an audio player out.
interface = gr.Interface(fn=synthesize, inputs="text", outputs="audio")
interface.launch()