Text2Speech / app.py
Abdelmageed95's picture
Update app.py
39fc4cb
import librosa
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import gradio as gr
import librosa
import numpy as np
import torch
# Hub identifier shared by the text processor and the acoustic model.
checkpoint = "microsoft/speecht5_tts"

# Everything is loaded once at import time so each request reuses the same
# objects; the processor is created first, then the model.
processor, model = (
    SpeechT5Processor.from_pretrained(checkpoint),
    SpeechT5ForTextToSpeech.from_pretrained(checkpoint),
)

# The vocoder comes from its own checkpoint and is handed to
# `model.generate_speech` by `predict`.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Maps each label offered by the "Speaker" input to the .npy file that holds
# that speaker's embedding vector.
_SPEAKER_EMBEDDING_FILES = {
    "KSP (male)": "cmu_us_ksp_arctic-wav-arctic_b0087.npy",
}
_DEFAULT_SPEAKER = "KSP (male)"
_SAMPLE_RATE = 16000


def predict(text, speaker=_DEFAULT_SPEAKER):
    """Synthesize speech for *text* and return it in Gradio's numpy audio form.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only text
            yields an empty clip instead of running the model.
        speaker: Label of the voice to use. It is optional so that existing
            single-argument callers keep working, while the two-input Gradio
            interface (text + speaker) can pass both values positionally.
            Unknown or missing labels fall back to the default speaker.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is a NumPy ``int16`` array.
    """
    if not text or not text.strip():
        return (_SAMPLE_RATE, np.zeros(0).astype(np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Limit the input length to what the model's config allows.
    input_ids = inputs["input_ids"]
    input_ids = input_ids[..., :model.config.max_text_positions]

    # Best-effort lookup: the UI only offers known labels, so an unexpected
    # value (e.g. None) simply selects the default voice rather than failing.
    embedding_file = _SPEAKER_EMBEDDING_FILES.get(
        speaker, _SPEAKER_EMBEDDING_FILES[_DEFAULT_SPEAKER]
    )
    speaker_embedding = np.load(embedding_file)
    # Add a leading axis before passing the embedding to the model.
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Scale the waveform by 32767 and store it as int16 samples.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (_SAMPLE_RATE, speech)
# Input widgets. Gradio passes the value of each input component to `fn`
# as a positional argument, in the order listed in `inputs`.
text_input = gr.Text(label="Input Text")
speaker_input = gr.Radio(
    label="Speaker",
    choices=["KSP (male)"],
    value="KSP (male)",
)

# Output widget; type="numpy" matches the (sample_rate, samples) tuple
# returned by `predict`.
audio_output = gr.Audio(label="Generated Speech", type="numpy")

demo = gr.Interface(
    fn=predict,
    inputs=[text_input, speaker_input],
    outputs=[audio_output],
)
demo.launch()