# Text-to-Voice / app.py
import streamlit as st
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from io import StringIO
import soundfile as sf
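
# Usage note (assumes Streamlit is installed in the environment): launch the
# app locally with
#
#     streamlit run app.py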

# Load models once at startup. st.cache_resource is the Streamlit cache meant
# for unserializable global resources such as model weights (st.cache_data
# would try to serialize the return value).
@st.cache_resource
def load_models():
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder

model, processor, vocoder = load_models()
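
# Note: generate_speech can also apply the vocoder itself in a single call,
# which would replace the two-step spectrogram -> waveform path used in
# text_to_speech() below (a minimal sketch, same models as above):
#
#     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings,
#                                    vocoder=vocoder)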

# Load a precomputed x-vector speaker embedding from disk; the model expects
# a (1, 512) float tensor.
@st.cache_data
def get_speaker_embeddings():
    speaker_embeddings = np.load("cmu_us_clb_arctic-wav-arctic_a0144.npy")
    return torch.tensor(speaker_embeddings).unsqueeze(0)

speaker_embeddings = get_speaker_embeddings()
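
# Alternative sketch (assumes the `datasets` package is installed): load an
# x-vector from the Matthijs/cmu-arctic-xvectors dataset instead of shipping
# a local .npy file. Index 7306 is the example used in the SpeechT5 docs;
# any other row would select a different voice.
#
#     from datasets import load_dataset
#     xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#     speaker_embeddings = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)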

# Inject custom styling from a local stylesheet.
def local_css(file_name):
    with open(file_name) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

local_css("style.css")
# Streamlined Layout
st.title("Text-to-Voice Conversion")
st.markdown("Convert your text to speech using advanced AI models.")
# Convert text to speech, splitting long input into fixed-size segments so
# each chunk stays within a length the model handles comfortably.
def text_to_speech(text):
    try:
        max_length = 100  # rough per-segment character budget; note this may split mid-word
        segments = [text[i:i + max_length] for i in range(0, len(text), max_length)]
        audio_paths = []
        for segment in segments:
            inputs = processor(text=segment, return_tensors="pt")
            with torch.no_grad():
                spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
                speech = vocoder(spectrogram)
            audio_path = f"speech_segment_{len(audio_paths)}.wav"
            sf.write(audio_path, speech.numpy(), samplerate=16000)  # SpeechT5 outputs 16 kHz audio
            audio_paths.append(audio_path)
        return audio_paths
    except Exception as e:
        st.error(f"Error in text-to-speech conversion: {e}")
        return []

# Concatenate the per-segment WAV files into a single output file.
def combine_audio_segments(paths):
    combined_speech = []
    samplerate = 16000  # default matches the rate used in text_to_speech()
    for path in paths:
        data, samplerate = sf.read(path)
        combined_speech.extend(data)
    sf.write("combined_speech.wav", np.array(combined_speech), samplerate)
    return "combined_speech.wav"

# Text input
text = st.text_area("Type your text or upload a text file below.")

# Convert button
if st.button("Convert"):
    if text:
        audio_paths = text_to_speech(text)
        if audio_paths:  # skip playback if conversion failed
            combined_audio_path = combine_audio_segments(audio_paths)
            with open(combined_audio_path, "rb") as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format="audio/wav")
    else:
        st.error("Please enter some text to convert.")

# File uploader
uploaded_file = st.file_uploader("Upload your text file here", type=["txt"])
if uploaded_file is not None:
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    text = stringio.read()
    st.write(text)
    if st.button("Convert Uploaded File", key=1):
        audio_paths = text_to_speech(text)
        if audio_paths:  # skip playback if conversion failed
            combined_audio_path = combine_audio_segments(audio_paths)
            with open(combined_audio_path, "rb") as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format="audio/wav")