import streamlit as st
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import torch
import torchaudio
import noisereduce as nr
import numpy as np
import soundfile as sf
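
# Pipeline: SpeechT5 generates a mel-spectrogram from the input text, conditioned on an
# x-vector speaker embedding extracted from an uploaded reference clip; the HiFi-GAN
# vocoder converts it to a 16 kHz waveform, which is denoised and played back in the UI.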

st.title("Ratan Tata SpeechT5 TTS Demo")

# Load the fine-tuned SpeechT5 model, processor, HiFi-GAN vocoder, and the x-vector
# speaker encoder once, and cache them across Streamlit reruns.
@st.cache_resource
def load_models():
    processor = SpeechT5Processor.from_pretrained("checkpoint-60000")  # Replace with your fine-tuned checkpoint folder
    model = SpeechT5ForTextToSpeech.from_pretrained("checkpoint-60000")  # Replace with your fine-tuned checkpoint folder
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    speaker_model = EncoderClassifier.from_hparams(source=spk_model_name, run_opts={"device": device})
    return processor, model, vocoder, speaker_model

processor, model, vocoder, speaker_model = load_models()

# Upload a reference audio clip and extract the speaker (x-vector) embedding from it
uploaded_file = st.file_uploader("Upload a reference audio file for the voice embedding:", type=["wav"])
speaker_embeddings = None
if uploaded_file is not None:
    signal, fs = torchaudio.load(uploaded_file)
    # encode_batch returns a 512-dim x-vector; reshape it to (1, 512) for generate_speech
    speaker_embeddings = speaker_model.encode_batch(signal).squeeze().cpu().numpy()
    speaker_embeddings = torch.tensor(np.array([speaker_embeddings]))

# Text input for TTS
input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")
if st.button("Generate Speech"):
    if speaker_embeddings is None:
        st.warning("Please upload a reference audio file first.")
        st.stop()

    # Split long input text into chunks of at most 80 characters and synthesize them one by one
    def split_text_by_length(text, max_length=80):
        words = text.split()
        result = []
        current_line = []
        for word in words:
            if len(' '.join(current_line + [word])) > max_length:
                result.append(' '.join(current_line))
                current_line = [word]
            else:
                current_line.append(word)
        if current_line:
            result.append(' '.join(current_line))
        return result

    text_chunks = split_text_by_length(input_text)
    all_speech = []

    for chunk in text_chunks:
        inputs = processor(text=chunk, return_tensors="pt")
        # Generate a spectrogram conditioned on the speaker embedding and vocode it to a waveform
        speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()

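        # Apply spectral noise reduction to the synthesized 16 kHz chunk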
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)

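    # Stitch the denoised chunks back into a single waveform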
    concatenated_speech = np.concatenate(all_speech)
    
    # Save the output audio
    sf.write("output_speech.wav", concatenated_speech, 16000)
    st.audio("output_speech.wav")

st.write("Upload a reference audio clip, enter some text, and generate speech that mimics Ratan Tata's voice!")