Spaces:

drewThomasson
/

OuteTTS-DEMO

Runtime error

App Files Files Community

OuteTTS-DEMO / app.py

drewThomasson

Update app.py

153c25e verified 6 months ago

raw

history blame

7.35 kB

	import logging
	import os
	import tempfile

	import gradio as gr
	from outetts.v0_1.interface import InterfaceHF

	# Configure logging to display INFO-level (and above) records in the terminal.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Load the OuteTTS model once, at import time; the request handlers below
	# all use this single module-level `interface` object.
	# A load failure is logged together with its traceback and then re-raised
	# unchanged, because nothing in this app can work without the model.
	try:
	    logger.info("Initializing OuteTTS InterfaceHF with model 'OuteAI/OuteTTS-0.1-350M'")
	    interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
	    logger.info("Model loaded successfully.")
	except Exception as e:
	    # logger.exception records the stack trace; bare `raise` keeps the
	    # original traceback intact.
	    logger.exception(f"Failed to load model: {e}")
	    raise

	def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
	    """
	    Generate speech from the input text using the OuteTTS model.

	    Parameters:
	    text (str): The input text for TTS.
	    temperature (float): Sampling temperature.
	    repetition_penalty (float): Repetition penalty.
	    max_length (int): Maximum length of the generated audio tokens.
	    speaker (dict | None): Speaker configuration for voice cloning, or
	        None when no cloned voice is requested.

	    Returns:
	    str | None: Path to the generated audio file, or None when the text is
	        blank or generation/saving failed (the failure is logged).
	    """
	    logger.info("Received TTS generation request.")
	    logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}, Speaker: {speaker is not None}")

	    # Nothing to synthesise: skip the model call instead of letting it fail.
	    if not text or not text.strip():
	        logger.warning("Empty text received; skipping TTS generation.")
	        return None

	    try:
	        # Due to a typo in interface.py, use 'max_lenght' instead of 'max_length'
	        output = interface.generate(
	            text=text,
	            temperature=temperature,
	            repetition_penalty=repetition_penalty,
	            max_lenght=max_length,  # Pass the parameter with typo
	            speaker=speaker,
	        )
	        logger.info("TTS generation complete.")

	        # Give every request its own file. A single shared "output.wav"
	        # would be overwritten when two requests overlap.
	        # delete=False keeps the file on disk after the handle is closed so
	        # it can be written by output.save() and read back by the caller.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            output_path = tmp.name
	        output.save(output_path)
	        logger.info(f"Audio saved to {output_path}")

	        return output_path  # Gradio will handle the audio playback
	    except Exception as e:
	        # Keep the UI alive on failure, but record the full traceback.
	        logger.exception(f"Error during TTS generation: {e}")
	        return None

	def create_speaker(audio_file, transcript):
	    """
	    Create a custom speaker from a reference audio file and transcript.

	    Parameters:
	    audio_file (str | os.PathLike | file-like | None): The reference audio.
	        A path is used as-is; any other object is expected to expose the
	        path through a ``.name`` attribute.
	    transcript (str): The transcript matching the audio.

	    Returns:
	    dict | None: Speaker configuration, or None when no audio was supplied
	        or speaker creation failed (the failure is logged).
	    """
	    logger.info("Received Voice Cloning request.")

	    if audio_file is None:
	        logger.error("Error during speaker creation: no reference audio provided.")
	        return None

	    # The reference component is declared with type="filepath", so the value
	    # arrives as a plain path string; reading `.name` on it would raise
	    # AttributeError. File-like objects are still supported via `.name`.
	    if isinstance(audio_file, (str, os.PathLike)):
	        audio_path = os.fspath(audio_file)
	    else:
	        audio_path = audio_file.name

	    logger.info(f"Reference Audio: {audio_path}, Transcript: {transcript}")

	    try:
	        speaker = interface.create_speaker(audio_path, transcript)
	    except Exception as e:
	        # Keep the UI alive on failure, but record the full traceback.
	        logger.exception(f"Error during speaker creation: {e}")
	        return None

	    logger.info("Speaker created successfully.")
	    return speaker

	# Define the Gradio Blocks interface: a "Basic TTS" tab that synthesises
	# speech without a speaker, and a "Voice Cloning" tab that first builds a
	# speaker configuration from reference audio and then synthesises with it.
	with gr.Blocks() as demo:
	    gr.Markdown("# 🎤 OuteTTS - Text to Speech Interface")
	    gr.Markdown(
	        """
	        Generate speech from text using the OuteTTS-0.1-350M model.

	        Key Features:
	        - Pure language modeling approach to TTS
	        - Voice cloning capabilities
	        - Compatible with LLaMa architecture
	        """
	    )

	    with gr.Tab("Basic TTS"):
	        with gr.Row():
	            text_input = gr.Textbox(
	                label="📄 Text Input",
	                placeholder="Enter the text for TTS generation",
	                lines=3,
	            )

	        with gr.Row():
	            temperature = gr.Slider(
	                minimum=0.1,
	                maximum=1.0,
	                value=0.1,
	                step=0.01,
	                label="🌡️ Temperature",
	            )
	            repetition_penalty = gr.Slider(
	                minimum=0.5,
	                maximum=2.0,
	                value=1.1,
	                step=0.1,
	                label="🔁 Repetition Penalty",
	            )
	            max_length = gr.Slider(
	                minimum=256,
	                maximum=4096,
	                value=1024,
	                step=256,
	                label="📏 Max Length",
	            )

	        generate_button = gr.Button("🔊 Generate Speech")

	        output_audio = gr.Audio(
	            label="🎧 Generated Speech",
	            type="filepath",  # Expecting a file path to the audio
	        )

	        # Define the button click event for Basic TTS.
	        # `inputs` may only list components, so the "no speaker" case is
	        # expressed by a wrapper that fixes speaker=None rather than by
	        # placing a literal None in the inputs list.
	        generate_button.click(
	            fn=lambda text, temp, penalty, length: generate_tts(
	                text, temp, penalty, length, None
	            ),
	            inputs=[text_input, temperature, repetition_penalty, max_length],
	            outputs=output_audio,
	        )

	    with gr.Tab("Voice Cloning"):
	        with gr.Row():
	            # NOTE(review): the former `source="upload"` / `optional=False`
	            # keyword arguments were dropped because they are not accepted
	            # by recent Gradio releases; confirm against the Gradio version
	            # this Space pins. Without them the component uses its defaults.
	            reference_audio = gr.Audio(
	                label="🔊 Reference Audio",
	                type="filepath",
	            )
	            reference_transcript = gr.Textbox(
	                label="📝 Transcript",
	                placeholder="Enter the transcript matching the reference audio",
	                lines=2,
	            )

	        create_speaker_button = gr.Button("🎤 Create Speaker")

	        # Holds the speaker configuration returned by create_speaker and
	        # feeds it back into generate_tts below.
	        speaker_info = gr.JSON(label="🗂️ Speaker Configuration")

	        generate_cloned_speech = gr.Textbox(
	            label="📄 Text Input",
	            placeholder="Enter the text for TTS generation with cloned voice",
	            lines=3,
	        )

	        with gr.Row():
	            temperature_clone = gr.Slider(
	                minimum=0.1,
	                maximum=1.0,
	                value=0.1,
	                step=0.01,
	                label="🌡️ Temperature",
	            )
	            repetition_penalty_clone = gr.Slider(
	                minimum=0.5,
	                maximum=2.0,
	                value=1.1,
	                step=0.1,
	                label="🔁 Repetition Penalty",
	            )
	            max_length_clone = gr.Slider(
	                minimum=256,
	                maximum=4096,
	                value=1024,
	                step=256,
	                label="📏 Max Length",
	            )

	        generate_cloned_button = gr.Button("🔊 Generate Cloned Speech")

	        output_cloned_audio = gr.Audio(
	            label="🎧 Generated Cloned Speech",
	            type="filepath",  # Expecting a file path to the audio
	        )

	        # Define the button click event for creating a speaker
	        create_speaker_button.click(
	            fn=create_speaker,
	            inputs=[reference_audio, reference_transcript],
	            outputs=speaker_info,
	        )

	        # Define the button click event for generating speech with the cloned voice
	        generate_cloned_button.click(
	            fn=generate_tts,
	            inputs=[
	                generate_cloned_speech,
	                temperature_clone,
	                repetition_penalty_clone,
	                max_length_clone,
	                speaker_info,
	            ],
	            outputs=output_cloned_audio,
	        )

	    gr.Markdown(
	        """
	        ---
	        Technical Blog: [OuteTTS-0.1-350M](https://www.outeai.com/blog/OuteTTS-0.1-350M)

	        Credits:
	        - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
	        - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
	        """
	    )

	# Launch the Gradio app only when this file is run as a script, so that
	# importing the module does not start the server.
	if __name__ == "__main__":
	    demo.launch()