Spaces:

RamananR
/

Ratan_Tata_TTS_INF

Sleeping

App Files Files Community

RamananR committed on Oct 16

Commit

454f31a

•

1 Parent(s): f8be69d

Create app.py

Browse files

Files changed (1) hide show

app.py +63 -0

app.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import streamlit as st
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from speechbrain.pretrained import EncoderClassifier
+import torch
+import torchaudio
+import noisereduce as nr
+import numpy as np
+import soundfile as sf
+# Load models and processor
+st.title("Ratan Tata SpeechT5 TTS Demo")
+spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Streamlit re-executes this script from the top on every widget interaction,
+# so the heavyweight models are loaded once and reused across reruns.
+@st.cache_resource
+def load_models():
+    """Return the (processor, TTS model, vocoder, speaker encoder) tuple used by the app."""
+    tts_processor = SpeechT5Processor.from_pretrained("checkpoint-60000")  # Replace with model folder
+    tts_model = SpeechT5ForTextToSpeech.from_pretrained("checkpoint-60000")  # Replace with model folder
+    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    # The speaker encoder is the only component explicitly placed on `device`.
+    xvector_encoder = EncoderClassifier.from_hparams(source=spk_model_name, run_opts={"device": device})
+    return tts_processor, tts_model, hifigan, xvector_encoder
+# Keep the original module-level names so the rest of the script is unaffected.
+processor, model, vocoder, speaker_model = load_models()
+# Upload audio file for voice embeddings
+# NOTE(review): the x-vector encoder's model card describes 16 kHz single-channel
+# training audio, so the reference clip is mixed down and resampled first - confirm.
+SPEAKER_SAMPLE_RATE = 16000
+uploaded_file = st.file_uploader("Upload a reference audio file:", type=["wav"])
+# Stays None until a reference recording has been uploaded.
+speaker_embeddings = None
+if uploaded_file is not None:
+    # The uploaded object is file-like; torchaudio.load is assumed to accept
+    # file-like input with the installed backend - TODO confirm.
+    signal, fs = torchaudio.load(uploaded_file)
+    if signal.shape[0] > 1:
+        # Average the channels so exactly one embedding is produced.
+        signal = signal.mean(dim=0, keepdim=True)
+    if fs != SPEAKER_SAMPLE_RATE:
+        signal = torchaudio.functional.resample(signal, fs, SPEAKER_SAMPLE_RATE)
+    speaker_embeddings = speaker_model.encode_batch(signal).squeeze().cpu().numpy()
+    # Re-wrap as a tensor with a leading batch dimension of 1.
+    speaker_embeddings = torch.tensor(np.array([speaker_embeddings]))
+else:
+    st.info("Upload a reference audio file to enable speech generation.")
+# Text input for TTS
+input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")
+if st.button("Generate Speech"):
+    def split_text_by_length(text, max_length=80):
+        """Greedily pack the whitespace-separated words of `text` into chunks.
+        Each returned chunk is the words joined by single spaces and is at most
+        `max_length` characters long, except that a single word longer than
+        `max_length` becomes a chunk of its own. Returns [] for blank text.
+        """
+        result = []
+        current_line = []
+        current_length = 0  # length of ' '.join(current_line)
+        for word in text.split():
+            # One extra character for the joining space when the line is non-empty.
+            candidate_length = current_length + len(word) + (1 if current_line else 0)
+            # Flush only a non-empty line: an over-long first word must not
+            # produce an empty chunk ahead of it.
+            if current_line and candidate_length > max_length:
+                result.append(' '.join(current_line))
+                current_line = [word]
+                current_length = len(word)
+            else:
+                current_line.append(word)
+                current_length = candidate_length
+        if current_line:
+            result.append(' '.join(current_line))
+        return result
+    # Drop blank chunks so the processor is never handed empty text.
+    splited_text = [chunk for chunk in split_text_by_length(input_text) if chunk]
+    if speaker_embeddings is None:
+        # generate_speech is called with a speaker embedding below; report a
+        # missing one instead of failing inside the model call.
+        st.warning("Please upload a reference audio file first.")
+    elif not splited_text:
+        # np.concatenate raises ValueError on an empty list, so stop early.
+        st.warning("Please enter some text to synthesize.")
+    else:
+        sample_rate = 16000  # rate passed to noise reduction and the WAV writer
+        all_speech = []
+        # Synthesize chunk by chunk, then stitch the pieces together.
+        for text_chunk in splited_text:
+            inputs = processor(text=text_chunk, return_tensors="pt")
+            speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+            if isinstance(speech_chunk, torch.Tensor):
+                speech_chunk = speech_chunk.cpu().numpy()
+            reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=sample_rate)
+            all_speech.append(reduced_noise_chunk)
+        concatenated_speech = np.concatenate(all_speech)
+        # Save the output audio
+        sf.write("output_speech.wav", concatenated_speech, sample_rate)
+        st.audio("output_speech.wav")
+# Usage hint; as a top-level statement it runs on every execution of the script, after the widgets above.
+st.write("Upload an audio file, input text, and generate speech that mimics Ratan Tata's voice!")