import streamlit as st
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import torch
import torchaudio
import noisereduce as nr
import numpy as np
import soundfile as sf
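
# Pipeline: SpeechT5 generates a mel-spectrogram from the input text, conditioned on an
# x-vector speaker embedding extracted from an uploaded reference clip; the HiFi-GAN
# vocoder converts it to a 16 kHz waveform, which is denoised and played back in the UI.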

st.title("Ratan Tata SpeechT5 TTS Demo")

# Load the fine-tuned SpeechT5 model, processor, HiFi-GAN vocoder, and the x-vector
# speaker encoder once, and cache them across Streamlit reruns.
@st.cache_resource
def load_models():
    processor = SpeechT5Processor.from_pretrained("checkpoint-60000")  # Replace with your fine-tuned checkpoint folder
    model = SpeechT5ForTextToSpeech.from_pretrained("checkpoint-60000")  # Replace with your fine-tuned checkpoint folder
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    speaker_model = EncoderClassifier.from_hparams(source=spk_model_name, run_opts={"device": device})
    return processor, model, vocoder, speaker_model

processor, model, vocoder, speaker_model = load_models()

# Upload a reference audio clip and extract the speaker (x-vector) embedding from it
uploaded_file = st.file_uploader("Upload a reference audio file for the voice embedding:", type=["wav"])
speaker_embeddings = None
if uploaded_file is not None:
    signal, fs = torchaudio.load(uploaded_file)
    # encode_batch returns a 512-dim x-vector; reshape it to (1, 512) for generate_speech
    speaker_embeddings = speaker_model.encode_batch(signal).squeeze().cpu().numpy()
    speaker_embeddings = torch.tensor(np.array([speaker_embeddings]))

# Text input for TTS
input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")
if st.button("Generate Speech"):
    if speaker_embeddings is None:
        st.warning("Please upload a reference audio file first.")
        st.stop()

    # Split long input text into chunks of at most 80 characters and synthesize them one by one
    def split_text_by_length(text, max_length=80):
        words = text.split()
        result = []
        current_line = []
        for word in words:
            if len(' '.join(current_line + [word])) > max_length:
                result.append(' '.join(current_line))
                current_line = [word]
            else:
                current_line.append(word)
        if current_line:
            result.append(' '.join(current_line))
        return result

    text_chunks = split_text_by_length(input_text)
    all_speech = []

    for chunk in text_chunks:
        inputs = processor(text=chunk, return_tensors="pt")
        # Generate a spectrogram conditioned on the speaker embedding and vocode it to a waveform
        speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()

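        # Apply spectral noise reduction to the synthesized 16 kHz chunk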
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)

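    # Stitch the denoised chunks back into a single waveform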
    concatenated_speech = np.concatenate(all_speech)
    
    # Save the output audio
    sf.write("output_speech.wav", concatenated_speech, 16000)
    st.audio("output_speech.wav")

st.write("Upload a reference audio clip, enter some text, and generate speech that mimics Ratan Tata's voice!")