import streamlit as st
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import torch
import torchaudio
import noisereduce as nr
import numpy as np
import soundfile as sf
# Load models and processor
st.title("Ratan Tata SpeechT5 TTS Demo")

processor = SpeechT5Processor.from_pretrained("checkpoint-60000")  # Replace with the path to your fine-tuned model folder
model = SpeechT5ForTextToSpeech.from_pretrained("checkpoint-60000")  # Replace with the path to your fine-tuned model folder
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(source=spk_model_name, run_opts={"device": device})
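# Note: SpeechT5's generate_speech expects a 512-dimensional speaker embedding; the x-vectors
# produced by speechbrain/spkrec-xvect-voxceleb have that size, which is why this particular
# speaker-verification model is paired with SpeechT5 here.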
# Upload audio file for voice embeddings
uploaded_file = st.file_uploader("Upload a reference audio clip of the target voice:", type=["wav"])

speaker_embeddings = None
if uploaded_file is not None:
    signal, fs = torchaudio.load(uploaded_file)
    # Resample to the 16 kHz rate expected by the x-vector model
    if fs != 16000:
        signal = torchaudio.functional.resample(signal, fs, 16000)
    # Encode the reference audio into an x-vector and add a batch dimension
    speaker_embeddings = speaker_model.encode_batch(signal).squeeze().cpu().numpy()
    speaker_embeddings = torch.tensor(np.array([speaker_embeddings]))
# Text input for TTS
input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")
if st.button("Generate Speech"):
    if speaker_embeddings is None:
        st.warning("Please upload a reference audio clip before generating speech.")
        st.stop()
    # Split long input into short chunks so each pass through SpeechT5 stays within a manageable length
    def split_text_by_length(text, max_length=80):
        words = text.split()
        result = []
        current_line = []
        for word in words:
            if len(' '.join(current_line + [word])) > max_length:
                result.append(' '.join(current_line))
                current_line = [word]
            else:
                current_line.append(word)
        if current_line:
            result.append(' '.join(current_line))
        return result
    text_chunks = split_text_by_length(input_text)
    all_speech = []
    for chunk in text_chunks:
        inputs = processor(text=chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()
        # Apply noise reduction to each 16 kHz chunk before stitching them together
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)
    concatenated_speech = np.concatenate(all_speech)

    # Save the output audio and play it back in the app
    sf.write("output_speech.wav", concatenated_speech, 16000)
    st.audio("output_speech.wav")
st.write("Upload an audio file, input text, and generate speech that mimics Ratan Tata's voice!") | |