#TODO:
#  + [x] Load Configuration
#  + [ ] Checking
#  + [ ] Better saving directory

from pathlib import Path
import sys

import gradio as gr
import torch
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model as load_vocoder
from transformers import pipeline

# Local imports
sys.path.append("src")
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Check if GPU is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# ASR part

data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
audio_files = sorted(Path(data_path).glob("**/*.wav"))
# audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))

transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
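
# A minimal usage sketch: the HF ASR pipeline accepts a file path and returns
# a dict with a "text" key, so any of the collected wav files can be transcribed
# as below. The helper name `transcribe_file` is illustrative, not part of the
# original demo flow.
def transcribe_file(path: Path) -> str:
    return transcriber(str(path))["text"]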

# TTS part
def load_model(lang, tag, vocoder_tag):
    """Load a pretrained ESPnet TTS model; return it with its vocoder tag (or None)."""
    if lang == "Japanese":
        if tag in (
            "kan-bayashi/ljspeech_parallel_wavegan",
            "kan-bayashi/ljspeech_merlin_multi_band_melgan",
        ):
            tts_model = Text2Speech.from_pretrained(tag, device=str(device))
        else:
            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
        vocoder = None if vocoder_tag == "none" else vocoder_tag
    elif lang == "English":
        # VITS is end-to-end and needs no external vocoder; the other models do.
        if tag == "kan-bayashi/libritts_xvector_vits":
            tts_model = Text2Speech.from_pretrained(tag, device=str(device))
            vocoder = None
        elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
            tts_model = Text2Speech.from_pretrained(tag, device=str(device))
            vocoder = "melgan"
        else:
            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
    else:
        raise ValueError(f"Not supported: lang={lang}")
    return tts_model, vocoder

tts_model, vocoder_tag = load_model(
    lang="English",
    tag="kan-bayashi/libritts_xvector_vits",
    vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long",
)

# load_model already places the model on `device` and returns None as the
# vocoder tag when the model (VITS) is end-to-end. Tag-based vocoders are
# loaded via parallel_wavegan, whose models provide the `inference` method
# used in synthesize() below.
if vocoder_tag is None or vocoder_tag == "none":
    vocoder = None
else:
    vocoder = load_vocoder(
        download_pretrained_model(vocoder_tag.replace("parallel_wavegan/", ""))
    ).to(device).eval()
    vocoder.remove_weight_norm()

# Gradio part
def synthesize(text):
    with torch.no_grad():
        # Recent ESPnet versions return a dict: "wav" holds the waveform and
        # "feat_gen" the generated features for an external vocoder.
        # Note: x-vector models such as libritts_xvector_vits also expect a
        # speaker embedding via the `spembs` argument.
        output = tts_model(text)
        if vocoder is not None:
            # Apply the neural vocoder to the generated features
            wav = vocoder.inference(output["feat_gen"])
        else:
            wav = output["wav"]
        # Convert to a 1-D numpy array for Gradio
        wav = wav.squeeze().cpu().numpy()
    # Gradio's audio component expects a (sample_rate, waveform) tuple
    return tts_model.fs, wav
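
# A minimal round-trip sketch, assuming the eventual goal of the demo is
# speech-to-speech: transcribe a recording with the ASR pipeline above, then
# re-synthesize it. The helper name `convert_speech` is illustrative.
def convert_speech(audio_path: str):
    text = transcriber(audio_path)["text"]
    return synthesize(text)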

interface = gr.Interface(synthesize, inputs="text", outputs="audio")
interface.launch()