RamananR committed on
Commit
454f31a
1 Parent(s): f8be69d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
3
+ from speechbrain.pretrained import EncoderClassifier
4
+ import torch
5
+ import torchaudio
6
+ import noisereduce as nr
7
+ import numpy as np
8
+ import soundfile as sf
9
+
10
# Load models and processor
st.title("Ratan Tata SpeechT5 TTS Demo")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def _load_models(checkpoint_dir, speaker_source, run_device):
    """Load the TTS processor, acoustic model, vocoder and speaker encoder.

    Streamlit re-executes this script from the top on every widget
    interaction. ``st.cache_resource`` keeps the returned objects alive
    across those reruns, so the checkpoints are loaded once per process
    instead of once per click.

    Args:
        checkpoint_dir: Folder (or hub id) of the fine-tuned SpeechT5 model.
        speaker_source: Hub id of the SpeechBrain speaker-embedding model.
        run_device: Device string passed to SpeechBrain's ``run_opts``.

    Returns:
        Tuple ``(processor, model, vocoder, speaker_model)``.
    """
    tts_processor = SpeechT5Processor.from_pretrained(checkpoint_dir)
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint_dir)
    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    encoder = EncoderClassifier.from_hparams(
        source=speaker_source, run_opts={"device": run_device}
    )
    return tts_processor, tts_model, hifigan, encoder


# Replace "checkpoint-60000" with the fine-tuned model folder.
processor, model, vocoder, speaker_model = _load_models(
    "checkpoint-60000", spk_model_name, device
)
19
+
20
# Upload audio file for voice embeddings
# The widget returns None until the user picks a file, which is what the
# `is not None` check below relies on.
uploaded_file = st.file_uploader("Upload a reference audio file for the speaker voice:")
speaker_embeddings = None  # stays None until a reference recording is provided
if uploaded_file is not None:
    signal, fs = torchaudio.load(uploaded_file)
    # torchaudio returns (channels, time) and encode_batch treats the first
    # axis as the batch, so a stereo file would yield two embeddings.
    # Down-mix to a single channel to always get exactly one.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)
    # NOTE(review): the x-vector model card specifies 16 kHz mono input and
    # encode_batch does not resample on its own - confirm against the card.
    if fs != 16000:
        signal = torchaudio.functional.resample(signal, fs, 16000)
        fs = 16000
    speaker_embeddings = speaker_model.encode_batch(signal).squeeze().cpu().numpy()
    # Re-wrap as a tensor with a leading batch axis for generate_speech.
    speaker_embeddings = torch.tensor(np.array([speaker_embeddings]))
26
+
27
# Free-form text the user wants synthesized; pre-filled with a sample sentence.
input_text = st.text_area(
    "Enter the text to be synthesized:",
    value="This is a generated audio example.",
)
29
+ if st.button("Generate Speech"):
30
def split_text_by_length(text, max_length=80):
    """Split *text* into chunks of whole words of at most *max_length* characters.

    Words are separated on any whitespace and re-joined with single spaces.
    A single word longer than ``max_length`` is kept intact as its own chunk
    (words are never cut), and no empty chunk is ever produced.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk.

    Returns:
        List of chunk strings in original order; ``[]`` when *text* contains
        no words.
    """
    chunks = []
    current_words = []
    current_len = 0  # length of " ".join(current_words), tracked incrementally
    for word in text.split():
        # A word costs its own length, plus one joining space unless it
        # starts the chunk.
        if current_words:
            candidate_len = current_len + 1 + len(word)
        else:
            candidate_len = len(word)
        # Only flush a non-empty line: an over-long word arriving on an empty
        # line must not emit an empty chunk.
        if current_words and candidate_len > max_length:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_len = len(word)
        else:
            current_words.append(word)
            current_len = candidate_len
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
43
+
44
# Synthesize the text chunk by chunk, then stitch the audio back together.
splited_text = split_text_by_length(input_text)
all_speech = []

if speaker_embeddings is None:
    # Without a speaker embedding there is no voice to condition on.
    st.write("Please provide a reference audio file before generating speech.")
else:
    for text_chunk in splited_text:
        if not text_chunk:
            # Never send a blank chunk to the model.
            continue
        inputs = processor(text=text_chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()

        # Denoise each chunk on its own before concatenation.
        all_speech.append(nr.reduce_noise(y=speech_chunk, sr=16000))

    if all_speech:
        concatenated_speech = np.concatenate(all_speech)

        # Save the output audio
        sf.write("output_speech.wav", concatenated_speech, 16000)
        st.audio("output_speech.wav")
    else:
        # np.concatenate raises ValueError on an empty list, which is what
        # empty or whitespace-only input would otherwise trigger.
        st.write("Please enter some text to synthesize.")

# NOTE(review): indentation was lost in this view; this hint is presumably
# shown regardless of the button state - confirm its nesting level.
st.write("Upload an audio file, input text, and generate speech that mimics Ratan Tata's voice!")
63
+