from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset
import noisereduce as nr
import soundfile as sf
import numpy as np
import os
import torch
import torchaudio
import streamlit as st
import uuid
st.title("Ratan Tata SpeechT5 TTS Demo")

# Load the fine-tuned model; replace "Ratan_tata_Voice_Cloned_tts" with your model folder
processor = SpeechT5Processor.from_pretrained("Ratan_tata_Voice_Cloned_tts")
processor.tokenizer.split_special_tokens = True
model = SpeechT5ForTextToSpeech.from_pretrained("Ratan_tata_Voice_Cloned_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
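# Optional sketch (assumes Streamlit >= 1.18): Streamlit reruns the whole script
# on every interaction, so caching the heavy objects avoids reloading them each
# time. To use it, replace the three from_pretrained calls above with
# `processor, model, vocoder = load_tts_components()`.
@st.cache_resource
def load_tts_components():
    proc = SpeechT5Processor.from_pretrained("Ratan_tata_Voice_Cloned_tts")
    proc.tokenizer.split_special_tokens = True
    tts = SpeechT5ForTextToSpeech.from_pretrained("Ratan_tata_Voice_Cloned_tts")
    voc = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return proc, tts, voc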
# Generic CMU Arctic x-vector as a baseline speaker embedding
# (overwritten below by the embedding computed from the reference recording)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)
# Load the reference recording used to derive the cloned speaker embedding
signal, fs = torchaudio.load('default_ratan_tata_voice.wav')
print(signal.shape, fs)
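# Defensive preprocessing (assumption: the reference wav may not already be
# 16 kHz mono, which is what the VoxCeleb x-vector model was trained on).
if signal.shape[0] > 1:
    signal = signal.mean(dim=0, keepdim=True)  # downmix to mono
if fs != 16000:
    signal = torchaudio.functional.resample(signal, orig_freq=fs, new_freq=16000)
    fs = 16000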
# Derive the speaker embedding from the reference audio with the x-vector model;
# encode_batch accepts the waveform tensor directly, no wrapping needed
speaker_embeddings = speaker_model.encode_batch(signal)                        # (1, 1, 512)
speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)  # L2-normalize
speaker_embeddings = speaker_embeddings.squeeze(1).cpu()                       # (1, 512), as generate_speech expects
def split_text_by_length(text, max_length=60):
    # SpeechT5 reportedly handles at most ~120 characters well, so default to
    # 60-character chunks, splitting on word boundaries.
    words = text.split()
    result = []
    current_line = []
    for word in words:
        # Start a new chunk once adding the next word would exceed max_length
        if current_line and len(' '.join(current_line + [word])) > max_length:
            result.append(' '.join(current_line))
            current_line = [word]
        else:
            current_line.append(word)
    # Flush the final chunk
    if current_line:
        result.append(' '.join(current_line))
    return result
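# Example: split_text_by_length("the quick brown fox jumps", max_length=10)
# returns ["the quick", "brown fox", "jumps"]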
# The text area must be created outside the button branch: a widget defined
# inside `if st.button(...)` only appears on the rerun after the click, so the
# app would always synthesize the default text instead of the user's input.
input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")

if st.button("Generate Speech"):
    text_chunks = split_text_by_length(input_text, max_length=80)
    print(text_chunks)
    all_speech = []
    for chunk in text_chunks:
        inputs = processor(text=chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()
        # The HiFi-GAN vocoder outputs 16 kHz audio; denoise each chunk before joining
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)
    # Concatenate the noise-reduced speech chunks into one waveform
    concatenated_speech = np.concatenate(all_speech)
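    # Optional sketch (assumption, not in the original app): a short silence
    # between chunks keeps sentence boundaries from running together.
    # Uncomment to join with ~0.15 s gaps instead of butt-splicing:
    # gap = np.zeros(int(0.15 * 16000), dtype=concatenated_speech.dtype)
    # concatenated_speech = np.concatenate(
    #     [seg for chunk in all_speech for seg in (chunk, gap)][:-1]
    # )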
    # Write the result under ./tmp with a unique name so repeated runs don't collide
    os.makedirs("./tmp", exist_ok=True)
    output_path = os.path.join("./tmp", f"output_speech_{uuid.uuid4()}.wav")
    sf.write(output_path, concatenated_speech, 16000)
    st.audio(output_path)