import streamlit as st
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import torch
import torchaudio
import noisereduce as nr
import numpy as np
import soundfile as sf
# Load models and processor
st.title("Ratan Tata SpeechT5 TTS Demo")
processor = SpeechT5Processor.from_pretrained("checkpoint-60000") # Replace with model folder
model = SpeechT5ForTextToSpeech.from_pretrained("checkpoint-60000") # Replace with model folder
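# The HiFi-GAN vocoder converts the model's generated mel-spectrogram into an audible waveform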
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
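# SpeechBrain x-vector model used to extract speaker embeddings from the reference audio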
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(source=spk_model_name, run_opts={"device": device})
# Upload audio file for voice embeddings
uploaded_file = st.file_uploader("Upload a reference audio file (.wav):", type=["wav"])
speaker_embeddings = None
if uploaded_file is not None:
    signal, fs = torchaudio.load(uploaded_file)
    # Extract the speaker x-vector and shape it as a (1, 512) tensor for generate_speech
    speaker_embeddings = speaker_model.encode_batch(signal).squeeze().cpu().numpy()
    speaker_embeddings = torch.tensor(np.array([speaker_embeddings]))
# Text input for TTS
input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")
if st.button("Generate Speech"):
    if speaker_embeddings is None:
        st.warning("Please upload a reference audio file before generating speech.")
        st.stop()

    # Split long input into chunks of at most max_length characters, breaking on word boundaries
    def split_text_by_length(text, max_length=80):
        words = text.split()
        result = []
        current_line = []
        for word in words:
            if len(' '.join(current_line + [word])) > max_length:
                result.append(' '.join(current_line))
                current_line = [word]
            else:
                current_line.append(word)
        if current_line:
            result.append(' '.join(current_line))
        return result
    split_text = split_text_by_length(input_text)
    all_speech = []
    for chunk in split_text:
        inputs = processor(text=chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()
        # Apply light noise reduction to each chunk before stitching them together
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)
    concatenated_speech = np.concatenate(all_speech)

    # Save the output audio and play it back in the app
    sf.write("output_speech.wav", concatenated_speech, 16000)
    st.audio("output_speech.wav")
st.write("Upload an audio file, input text, and generate speech that mimics Ratan Tata's voice!")