import streamlit as st
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import torch
import torchaudio
import noisereduce as nr
import numpy as np
import soundfile as sf
# Load models and processor
st.title("Ratan Tata SpeechT5 TTS Demo")
processor = SpeechT5Processor.from_pretrained("checkpoint-60000") # Replace with model folder
model = SpeechT5ForTextToSpeech.from_pretrained("checkpoint-60000") # Replace with model folder
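# HiFi-GAN vocoder that converts SpeechT5's mel spectrogram output into a waveform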
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
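# SpeechBrain x-vector encoder (trained on VoxCeleb) used to extract the 512-dim speaker embedding SpeechT5 expects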
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(source=spk_model_name, run_opts={"device": device})
# Upload a reference audio file and derive speaker embeddings from it
uploaded_file = st.file_uploader("Upload a reference audio file (WAV):", type=["wav"])
speaker_embeddings = None
if uploaded_file is not None:
    signal, fs = torchaudio.load(uploaded_file)
    speaker_embeddings = speaker_model.encode_batch(signal).squeeze().cpu().numpy()
    speaker_embeddings = torch.tensor(np.array([speaker_embeddings]))
# Text input for TTS
input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")
if st.button("Generate Speech"):
    if speaker_embeddings is None:
        st.error("Please upload a reference audio file first.")
        st.stop()

    # Split long inputs into word-boundary chunks of at most max_length characters
    def split_text_by_length(text, max_length=80):
        words = text.split()
        result = []
        current_line = []
        for word in words:
            if len(' '.join(current_line + [word])) > max_length:
                result.append(' '.join(current_line))
                current_line = [word]
            else:
                current_line.append(word)
        if current_line:
            result.append(' '.join(current_line))
        return result

    split_texts = split_text_by_length(input_text)
    all_speech = []
    for chunk in split_texts:
        inputs = processor(text=chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()
        # Denoise each synthesized chunk (SpeechT5 outputs 16 kHz audio)
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)

    concatenated_speech = np.concatenate(all_speech)

    # Save the output audio and play it back in the app
    sf.write("output_speech.wav", concatenated_speech, 16000)
    st.audio("output_speech.wav")
st.write("Upload an audio file, input text, and generate speech that mimics Ratan Tata's voice!")