vits-japanese / app.py
quyanh's picture
modify
4bb1137
raw
history blame
2.22 kB
import base64
import torch
import io
import tempfile
import scipy.io.wavfile as wavfile
import commons
import utils
import gradio as gr
import numpy as np
from PIL import Image
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
def get_text(text, hps):
    """Turn raw text into a ``torch.LongTensor`` of symbol ids.

    The text is converted with ``text_to_sequence`` using the cleaners listed
    in ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is truthy, the
    id ``0`` is interspersed through the sequence via ``commons.intersperse``
    before the tensor is built.
    """
    cleaners = hps.data.text_cleaners
    symbol_ids = text_to_sequence(text, cleaners)
    sequence = commons.intersperse(symbol_ids, 0) if hps.data.add_blank else symbol_ids
    return torch.LongTensor(sequence)
def text_to_speech(text):
    """Synthesize ``text`` and return the path of the generated WAV file.

    Uses the module-level ``hps`` (hyperparameters) and ``net_g`` (synthesizer).
    Inference runs with fixed settings: ``noise_scale=.667``,
    ``noise_scale_w=0.8`` and ``length_scale=1.2``.

    Args:
        text: The text to synthesize.

    Returns:
        Path of a temporary ``.wav`` file written at ``hps.data.sampling_rate``.
        The file is created with ``delete=False``, so it is not removed
        automatically; whoever consumes the path is responsible for cleanup.
    """
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        # Add a batch dimension of size 1 and pass the sequence length.
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        output = net_g.infer(
            x_tst,
            x_tst_lengths,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1.2,
        )
        # detach().cpu() instead of the legacy .data accessor: this also keeps
        # the NumPy conversion valid if the model is ever moved off the CPU.
        audio = output[0][0, 0].detach().cpu().float().numpy()

    # Only reserve a unique path here and let the handle close before writing.
    # Re-opening a NamedTemporaryFile by name while it is still open is not
    # possible on every platform (it fails on Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        audio_file = f.name
    wavfile.write(audio_file, hps.data.sampling_rate, audio)

    # Return the audio file path
    return audio_file
# Load the trained model.
# Hyperparameters are read from the JSON config; model_dir is then pointed at
# the directory from which the checkpoint path below is built.
hps = utils.get_hparams_from_file("./configs/jp_base.json")
hps.model_dir = './logs/jp_base'
pretrained_model = f'{hps.model_dir}/model.pth'
# Build the synthesizer. Positional arguments, in order: the number of text
# symbols, filter_length // 2 + 1 (presumably the spectrogram bin count --
# confirm against SynthesizerTrn), and segment_size divided by hop_length.
# All remaining settings are taken from hps.model.
net_g = SynthesizerTrn(
len(symbols),
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model)
# NOTE(review): presumably switches the torch module to inference mode -- confirm.
_ = net_g.eval()
# The third argument is None; presumably the optimizer slot, which is not
# needed for inference -- confirm against utils.load_checkpoint.
_ = utils.load_checkpoint(pretrained_model, net_g, None)
# Callback handed to the Gradio interface as its ``fn``.
def generate_speech(text):
    """Return whatever ``text_to_speech`` produces for ``text``.

    This is a thin pass-through; the result (an audio file path) is returned
    unchanged rather than wrapped in a dictionary.
    """
    return text_to_speech(text)
# Define the input and output components for the text-to-speech model.
# The top-level component classes are used instead of the gr.inputs /
# gr.outputs namespaces, which are deprecated in Gradio 3 and removed in
# Gradio 4. The constructor arguments are unchanged.
text_input = gr.Textbox(label='Enter Text Here')
# type='filepath': generate_speech returns the path of a WAV file on disk.
output_audio = gr.Audio(label='Speech', type='filepath')
# Define the user interface using Gradio
ui = gr.Interface(
    fn=generate_speech,
    inputs=text_input,
    outputs=output_audio,
    title='Text-to-Speech Demo',
    description='Generate speech from text using a text-to-speech model.',
)
# Run the interface
ui.launch()