Browse files
@@ -0,0 +1,63 @@
1 |
import streamlit as st
2 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
3 |
from speechbrain.pretrained import EncoderClassifier
4 |
import torch
5 |
import torchaudio
6 |
import noisereduce as nr
7 |
import numpy as np
8 |
import soundfile as sf
9 |
10 |
# Load models and processor
st.title("Ratan Tata SpeechT5 TTS Demo")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def _load_models(checkpoint_dir, speaker_model_source, run_device):
    """Load the TTS processor, TTS model, vocoder and speaker encoder once.

    Streamlit re-executes the whole script on every widget interaction, so
    loading the models at module level would reload all of them on each
    rerun.  ``st.cache_resource`` keeps a single shared copy per distinct
    argument tuple for the lifetime of the server process.
    NOTE(review): ``st.cache_resource`` needs a reasonably recent Streamlit
    release -- confirm the deployed version provides it.

    Args:
        checkpoint_dir: Folder (or hub id) holding the fine-tuned SpeechT5
            processor and model.
        speaker_model_source: SpeechBrain source for the speaker encoder.
        run_device: Device string handed to SpeechBrain via ``run_opts``.

    Returns:
        A ``(processor, model, vocoder, speaker_model)`` tuple.
    """
    loaded_processor = SpeechT5Processor.from_pretrained(checkpoint_dir)
    loaded_model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint_dir)
    loaded_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    loaded_speaker_model = EncoderClassifier.from_hparams(
        source=speaker_model_source,
        run_opts={"device": run_device},
    )
    return loaded_processor, loaded_model, loaded_vocoder, loaded_speaker_model


# "checkpoint-60000" is a placeholder: replace with the model folder.
processor, model, vocoder, speaker_model = _load_models(
    "checkpoint-60000", spk_model_name, device
)
19 |
20 |
# Upload audio file for voice embeddings
# The reference clip is turned into a single speaker embedding that conditions
# the TTS model.  A real uploader replaces the former empty-string
# placeholder, which was never ``None`` and therefore always reached
# ``torchaudio.load('')``.
uploaded_file = st.file_uploader("Upload a reference audio file", type=["wav"])
speaker_embeddings = None
if uploaded_file is None:
    # Nothing to embed yet: tell the user and end this script run instead of
    # continuing with an undefined/empty speaker embedding.
    st.info("Upload a reference audio file to enable speech generation.")
    st.stop()

signal, fs = torchaudio.load(uploaded_file)

# NOTE(review): the speaker encoder is assumed to expect 16 kHz mono input
# when called through ``encode_batch`` -- confirm against the model card.
if signal.shape[0] > 1:
    # Average the channels so exactly one embedding is produced.
    signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)

with torch.no_grad():
    embedding = speaker_model.encode_batch(signal)

# Drop singleton dimensions, then add a leading batch dimension; this mirrors
# the original ``torch.tensor(np.array([embedding]))`` conversion.
speaker_embeddings = embedding.squeeze().cpu().unsqueeze(0)
26 |
27 |
# Text input for TTS
28 |
input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")
29 |
if st.button("Generate Speech"):
30 |
def split_text_by_length(text, max_length=80):
    """Split *text* into whitespace-delimited chunks of limited length.

    Words are packed greedily: a word joins the current chunk while the
    chunk (words joined by single spaces) stays within ``max_length``
    characters; otherwise the current chunk is emitted and the word starts
    a new one.  Words are never broken, so a single word longer than
    ``max_length`` becomes a chunk of its own.

    Args:
        text: Input string; runs of whitespace are treated as one separator.
        max_length: Maximum number of characters per chunk (default 80).

    Returns:
        A list of chunk strings in original word order; empty when *text*
        contains no words.
    """
    chunks = []
    current_words = []
    current_len = 0  # length of " ".join(current_words), tracked incrementally

    for word in text.split():
        # One extra character for the joining space unless the chunk is empty.
        projected_len = current_len + len(word) + (1 if current_words else 0)
        if current_words and projected_len > max_length:
            # Chunk is full: emit it and start a new one with this word.
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_len = len(word)
        else:
            # The word fits (or the chunk is empty, in which case an
            # over-long word is accepted rather than emitting "").
            current_words.append(word)
            current_len = projected_len

    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
43 |
44 |
# NOTE(review): these statements presumably form the rest of the body of the
# `if st.button("Generate Speech"):` handler; the listing this block sits in
# has lost its indentation, so it is written at the same column as the
# surrounding lines.
# Synthesize the text chunk by chunk, denoise each chunk, then join them.
splited_text = split_text_by_length(input_text)
all_speech = []

if not splited_text:
    # Nothing to synthesize; np.concatenate would raise on an empty list.
    st.warning("Please enter some text to synthesize.")
else:
    for text_chunk in splited_text:
        inputs = processor(text=text_chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.detach().cpu().numpy()

        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        # Collect the denoised chunk; without this the list stays empty and
        # the concatenation below fails.
        all_speech.append(reduced_noise_chunk)

    concatenated_speech = np.concatenate(all_speech)

    # Save the output audio
    sf.write("output_speech.wav", concatenated_speech, 16000)

    # Play the result back in the page so the user can hear it.
    st.audio("output_speech.wav", format="audio/wav")
60 |
61 |
62 |
st.write("Upload an audio file, input text, and generate speech that mimics Ratan Tata's voice!")
63 |