Spaces:

Chillarmo
/

Voice_Cloning_with_OuteTTS

Running

App Files Files Community

Voice_Cloning_with_OuteTTS / app.py

Chillarmo

Update app.py

cc2340f verified about 2 months ago

raw

history blame

7.51 kB

	import gradio as gr
	import torch
	from outetts.v0_1.interface import InterfaceGGUF
	import soundfile as sf
	import tempfile
	import os
	from faster_whisper import WhisperModel
	import huggingface_hub

	def download_model():
	    """Fetch the OuteTTS GGUF weights file from the Hugging Face Hub.

	    Returns:
	        The value returned by ``huggingface_hub.hf_hub_download`` for the
	        requested file (the path handed to the TTS interface by the caller).
	    """
	    hub_repo = "OuteAI/OuteTTS-0.1-350M-GGUF"
	    weights_name = "outetts-0.1-350m.gguf"
	    return huggingface_hub.hf_hub_download(repo_id=hub_repo, filename=weights_name)

	def initialize_models():
	    """Construct the text-to-speech and speech-recognition models.

	    Returns:
	        A ``(tts_interface, asr_model)`` tuple: an ``InterfaceGGUF`` built
	        from the downloaded weights and a CPU ``WhisperModel("tiny")``.
	    """
	    # TTS: fetch the GGUF weights first, then load them with conservative
	    # llama-style settings (small context/batch, four threads, quiet logs).
	    gguf_path = download_model()
	    tts_settings = {
	        "n_ctx": 2048,
	        "n_batch": 512,
	        "n_threads": 4,
	        "verbose": False,
	    }
	    tts = InterfaceGGUF(gguf_path, **tts_settings)

	    # ASR: the smallest Whisper checkpoint, int8 on a single CPU thread.
	    whisper_settings = {
	        "device": "cpu",
	        "compute_type": "int8",
	        "num_workers": 1,
	        "cpu_threads": 1,
	    }
	    whisper = WhisperModel("tiny", **whisper_settings)

	    return tts, whisper

	# Load both models once at import time so every request reuses them.
	# A failure is reported on stdout and then re-raised, aborting start-up.
	try:
	    TTS_INTERFACE, ASR_MODEL = initialize_models()
	except Exception as init_error:
	    print(f"Error initializing models: {init_error}")
	    raise

	def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
	    """Generate speech for ``text_to_speak`` in the voice of a reference recording.

	    Args:
	        audio_path: Path of the reference audio file; falsy when nothing was
	            uploaded.
	        reference_text: Transcript of the reference audio. When blank or
	            ``None`` the audio is transcribed with ``transcribe_audio``.
	        text_to_speak: Text to synthesise; only the first 300 characters are
	            used.
	        temperature: Passed through to ``TTS_INTERFACE.generate``.
	        repetition_penalty: Passed through to ``TTS_INTERFACE.generate``.

	    Returns:
	        ``(wav_path, status_message)`` on success, or ``(None, message)``
	        when an input is missing or any processing step raises.
	    """
	    # Validate inputs up front. Without this, the "Please upload audio first."
	    # reply from transcribe_audio would be mistaken for a transcript because
	    # it does not start with "Error".
	    if not audio_path:
	        return None, "Please upload audio first."
	    if not (text_to_speak or "").strip():
	        return None, "Please enter the text to speak."

	    # The textbox value may be None; treat that like an empty transcript.
	    reference_text = reference_text or ""

	    try:
	        # If no reference text provided, transcribe the audio
	        if not reference_text.strip():
	            gr.Info("Transcribing audio...")
	            reference_text = transcribe_audio(audio_path)
	            # transcribe_audio reports failures as a string starting with "Error".
	            if reference_text.startswith("Error"):
	                return None, reference_text
	            if not reference_text.strip():
	                return None, "Error: no speech could be transcribed from the reference audio."

	        gr.Info(f"Using reference text: {reference_text}")

	        # Limit text lengths to prevent context overflow
	        reference_text = reference_text[:2000]
	        text_to_speak = text_to_speak[:300]

	        # Create speaker from reference audio
	        speaker = TTS_INTERFACE.create_speaker(
	            audio_path,
	            reference_text,
	        )

	        # Generate speech with cloned voice.
	        # NOTE(review): `max_lenght` (sic) is kept exactly as written;
	        # presumably it matches the keyword spelling used by the outetts
	        # v0.1 generate() signature - confirm before renaming.
	        output = TTS_INTERFACE.generate(
	            text=text_to_speak,
	            speaker=speaker,
	            temperature=temperature,
	            repetition_penalty=repetition_penalty,
	            max_lenght=1024,
	        )

	        # Reserve a temp file name and close the handle before writing, so no
	        # descriptor is leaked and the save is not blocked by an open handle.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
	            output_path = temp_file.name
	        try:
	            output.save(output_path)
	        except Exception:
	            # Do not leave an empty placeholder file behind on failure.
	            os.remove(output_path)
	            raise

	        # Only claim truncation in the status text when it actually happened.
	        preview_limit = 300
	        if len(reference_text) > preview_limit:
	            status = (
	                "Processing complete!\n"
	                f"Reference text: {reference_text[:preview_limit]}...\n"
	                "(Showing first 300 characters of reference text)"
	            )
	        else:
	            status = f"Processing complete!\nReference text: {reference_text}"
	        return output_path, status

	    except Exception as e:
	        # Surface any failure in the status box instead of crashing the UI.
	        return None, f"Error: {str(e)}"

	# Create Gradio interface
	# NOTE(review): the indentation of this file was lost; the nesting of the
	# `with` blocks below is reconstructed from the statement order - confirm
	# against the deployed app. The contents of the multi-line strings are left
	# exactly as found.
	with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
	    # Page heading followed by usage notes.
	    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
	    gr.Markdown("""
	This app uses the GGUF version of OuteTTS optimized for CPU performance. Upload a reference audio file,
	provide the text being spoken in that audio (or leave blank for automatic transcription),
	and enter the new text you want to be spoken in the cloned voice.

	Note:
	- For best results, use clear audio with minimal background noise
	- Reference text is limited to 2000 characters
	- Output text is limited to 300 characters
	- Short inputs work best for quality results
	""")

	    with gr.Row():
	        with gr.Column():
	            # Input components
	            audio_input = gr.Audio(
	                label="Upload Reference Audio",
	                type="filepath",
	                max_length=30 # Limit audio length to 30 seconds
	            )
	            with gr.Row():
	                transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")

	            # Transcript of the reference audio; filled by the transcribe
	            # button or typed by the user.
	            reference_text = gr.Textbox(
	                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
	                placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
	                lines=3,
	                max_lines=5
	            )
	            # New text to be spoken in the cloned voice.
	            text_to_speak = gr.Textbox(
	                label="Text to Speak (what you want the cloned voice to say, max 300 characters)",
	                placeholder="Enter the text you want the cloned voice to speak (keep it short for best results)",
	                lines=3,
	                max_lines=5
	            )

	            # Sampling controls passed to process_audio_file.
	            with gr.Row():
	                temperature = gr.Slider(
	                    minimum=0.1,
	                    maximum=0.5, # Reduced maximum temperature
	                    value=0.1,
	                    step=0.05,
	                    label="Temperature (keep low for stability)"
	                )
	                repetition_penalty = gr.Slider(
	                    minimum=1.0,
	                    maximum=1.3, # Reduced maximum
	                    value=1.1,
	                    step=0.05,
	                    label="Repetition Penalty"
	                )

	            # Submit button
	            submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")

	        with gr.Column():
	            # Output components
	            output_audio = gr.Audio(label="Generated Speech")
	            output_message = gr.Textbox(label="Status", lines=4)

	    # Add warning about processing time
	    gr.Markdown("""
	⚠️ Note: Initial processing may take a few moments. Please be patient.
	""")

	    # Handle transcription button
	    def transcribe_audio(audio_path):
	        """Transcribe audio using Faster-Whisper tiny

	        Args:
	            audio_path: Path of the audio file to transcribe; falsy when no
	                audio has been uploaded.

	        Returns:
	            The segment texts joined by single spaces, stripped and cut to
	            2000 characters. Returns "Please upload audio first." when
	            ``audio_path`` is falsy, and a string starting with
	            "Error transcribing audio:" when any step raises.
	        """
	        try:
	            if not audio_path:
	                return "Please upload audio first."

	            # NOTE(review): temperature=1.0 is unusually high for
	            # transcription - confirm this is intended.
	            segments, _ = ASR_MODEL.transcribe(
	                audio_path,
	                beam_size=1,
	                best_of=1,
	                temperature=1.0,
	                condition_on_previous_text=False,
	                compression_ratio_threshold=2.4,
	                log_prob_threshold=-1.0,
	                no_speech_threshold=0.6
	            )

	            text = " ".join([segment.text for segment in segments]).strip()
	            return text[:2000] # Limit transcription length
	        except Exception as e:
	            # Errors are returned as text; process_audio_file recognises them
	            # by the leading "Error".
	            return f"Error transcribing audio: {str(e)}"

	    # The transcribe button writes its result into the reference text box.
	    transcribe_btn.click(
	        fn=transcribe_audio,
	        inputs=[audio_input],
	        outputs=[reference_text],
	    )

	    # Handle main generation
	    submit_btn.click(
	        fn=process_audio_file,
	        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
	        outputs=[output_audio, output_message]
	    )

	    # Closing tips shown after the controls.
	    gr.Markdown("""
	### Tips for best results:
	1. Use clear, short audio samples (5-15 seconds is ideal)
	2. Keep both reference and output text concise
	3. Use lower temperature (0.1-0.2) for more stable output
	4. Start with short phrases to test the voice
	5. If generation fails, try:
	- Using shorter text
	- Reducing temperature
	- Using clearer audio
	- Simplifying the text
	""")

	# Launch the Gradio app only when this file is run as a script, not on import.
	if __name__ == "__main__":
	    demo.launch()