# NOTE(review): removed stray build-log residue ("Spaces:" / "Build error")
# that was not valid Python and would break parsing of this module.
""" | |
TODO: | |
+ [x] Load Configuration | |
+ [ ] Checking | |
+ [ ] Better saving directory | |
""" | |
import numpy as np | |
from pathlib import Path | |
import jiwer | |
import pdb | |
import torch.nn as nn | |
import torch | |
import torchaudio | |
from transformers import pipeline | |
from time import process_time, time | |
from pathlib import Path | |
# local import | |
import sys | |
from espnet2.bin.tts_inference import Text2Speech | |
# pdb.set_trace() | |
# Prefer GPU 0 when CUDA is present; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Make local sources importable.
sys.path.append("src")

# --- ASR part -------------------------------------------------------------
# Collect every *.wav under the (hard-coded) recording directory, sorted so
# the processing order is deterministic across runs.
_audio_root = Path("/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video")
audio_files = list(map(str, sorted(_audio_root.glob("**/*wav"))))

# Speech-recognition model pulled from the Hugging Face Hub.
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1") | |
# 【Female】kan-bayashi ljspeech parallel wavegan | |
# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits") | |
# 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder | |
# pdb.set_trace() | |
# NOTE(review): the two fairseq imports below are never used in this file;
# kept in case other code relies on their import side effects — confirm and
# drop if truly dead.
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

# @title English multi-speaker pretrained model { run: "auto" }
lang = 'English'  # currently unused in this file — TODO confirm before removing
# Multi-speaker VITS model conditioned on speaker x-vectors (LibriTTS).
tag = 'kan-bayashi/libritts_xvector_vits'
# VITS is end-to-end, so an external vocoder is optional; one is configured here.
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long"

from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Build the TTS engine.
# BUG FIX: `device` was hard-coded to "cuda", which crashes on CPU-only
# machines even though a CPU fallback is already computed above via
# torch.cuda.is_available(); reuse that selection here.
text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device=str(device),
    # NOTE(review): the attention-window options below target attention-based
    # decoders and are presumably ignored by VITS — confirm and prune.
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    speed_control_alpha=1.0,
)
import glob | |
import os | |
import numpy as np | |
import kaldiio | |
# Get model directory path | |
from espnet_model_zoo.downloader import ModelDownloader | |
# Resolve the local directory where the pretrained model was unpacked; the
# speaker x-vector dumps live two directory levels above the train config.
d = ModelDownloader()
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])

# Speaker x-vector selection: take the first ark whose path contains "tr"
# (the training split) and load it into an in-memory {speaker_id: vector} map.
_ark_candidates = glob.glob(f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True)
xvector_ark = [p for p in _ark_candidates if "tr" in p][0]
xvectors = dict(kaldiio.load_ark(xvector_ark))
# spks = list(xvectors.keys()) | |
# Hand-picked LibriTTS speakers: display label -> corpus speaker id.
male_spks = {
    "M1": "2300_131720",
    "M2": "1320_122612",
    "M3": "1188_133604",
    "M4": "61_70970",
}
female_spks = {
    "F1": "2961_961",
    "F2": "8463_287645",
    "F3": "121_121726",
}
spks = {**male_spks, **female_spks}
spk_names = sorted(spks)

# Fetch the pretrained x-vector for every selected speaker.
selected_xvectors = [xvectors[spk_id] for spk_id in spks.values()]
selected_xvectors_dict = {label: xvectors[spk_id] for label, spk_id in spks.items()}
# Output directory for synthesized audio; create it up front so torchaudio
# cannot fail on a missing path.
out_dir = Path("./wav")
out_dir.mkdir(parents=True, exist_ok=True)

for audio_file in audio_files:
    # 1) ASR: transcribe the reference recording.
    text = transcriber(audio_file)["text"]
    # 2) Load the reference speech used as the TTS reference input.
    #    torchaudio.load returns a (channels, frames) tensor plus the rate.
    speech, sr = torchaudio.load(audio_file)
    # NOTE(review): dropped the old `duration = len(speech)/sr` — it was
    # unused, and len() of a (channels, frames) tensor yields the channel
    # count, not the frame count.  Also dropped the unused `t_start`.
    # 3) Re-synthesize the transcript once per selected speaker embedding.
    #    BUG FIX: the loop variable was named `spks`, shadowing the
    #    module-level speaker dict; renamed to `spk_name`.
    for spk_name, spembs in selected_xvectors_dict.items():
        wav_tensor_spembs = text2speech(text=text, speech=speech, spembs=spembs)["wav"]
        # NOTE(review): 22050 Hz is assumed to be the model's output sample
        # rate — confirm against text2speech.fs.
        torchaudio.save(
            str(out_dir / f"{Path(audio_file).stem}_{spk_name}_spkembs.wav"),
            src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
            sample_rate=22050,
        )