Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Files Community

voiceclone-dev / app.py

crackuser

Update app.py

ba703e9 verified 4 months ago

raw

history blame contribute delete

9.81 kB

	import gradio as gr
	import torch
	import torchaudio
	import tempfile
	import os
	import warnings
	from contextlib import contextmanager
	import gc
	import librosa
	import soundfile as sf

	# Install a blanket "ignore" filter so no Python warnings are shown for the rest of the process.
	warnings.filterwarnings("ignore")
	# Must be set before the TTS package is imported further down.
	# NOTE(review): presumably pre-accepts the Coqui model licence prompt so loading is non-interactive — confirm.
	os.environ["COQUI_TOS_AGREED"] = "1"
	# Startup banner written to stdout.
	print("🚀 Starting FINAL CORRECTED Voice Cloning Studio...")

	@contextmanager
	def patch_torch_load():
	    """Temporarily make every ``torch.load`` call run with ``weights_only=False``.

	    While the ``with`` block is active, ``torch.load`` is replaced by a thin
	    wrapper that forwards all positional and keyword arguments unchanged but
	    overrides ``weights_only`` to ``False``. The original function is always
	    restored on exit, even if the body raises.

	    SECURITY NOTE: ``weights_only=False`` allows full pickle deserialisation,
	    which can execute arbitrary code from the checkpoint. Only use this
	    around checkpoints from a trusted source.
	    """
	    original_load = torch.load

	    def patched_load(f, *args, **kwargs):
	        # Force the flag regardless of what the caller asked for.
	        kwargs['weights_only'] = False
	        return original_load(f, *args, **kwargs)

	    torch.load = patched_load
	    try:
	        yield
	    finally:
	        # Undo the monkey-patch so code outside the block sees the real loader.
	        torch.load = original_load

	# Pick the compute device once at import time: CUDA when torch reports it, CPU otherwise.
	if torch.cuda.is_available():
	    DEVICE = "cuda"
	else:
	    DEVICE = "cpu"
	print(f"🔥 Device: {DEVICE}")

	# Module-level model handles; both start empty, and the status string starts as "Not Loaded".
	TTS_MODEL = WHISPER_MODEL = None
	MODEL_STATUS = "Not Loaded"

	def load_xtts_optimized():
	    """Load the XTTS-v2 model into the module-level ``TTS_MODEL`` once.

	    Returns:
	        True if the model was already loaded or was loaded by this call,
	        False if loading raised. ``MODEL_STATUS`` is updated in both cases
	        ("XTTS-v2 Ready" or "XTTS Failed: <error>").
	    """
	    global TTS_MODEL, MODEL_STATUS
	    if TTS_MODEL is not None:
	        return True
	    try:
	        # The import and construction run with torch.load patched to
	        # weights_only=False (see patch_torch_load).
	        with patch_torch_load():
	            from TTS.api import TTS
	            print("📦 Loading XTTS...")
	            TTS_MODEL = TTS(
	                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
	                progress_bar=False,
	                gpu=(DEVICE == "cuda"),
	            )
	        MODEL_STATUS = "XTTS-v2 Ready"
	        print("✅ XTTS loaded successfully!")
	        return True
	    except Exception as e:
	        # Deliberately broad: any import or load failure is reported via
	        # MODEL_STATUS rather than crashing the app.
	        print(f"❌ XTTS loading failed: {e}")
	        MODEL_STATUS = f"XTTS Failed: {e}"
	        return False

	def load_whisper_optimized():
	    """Load the Whisper "base" model into the module-level ``WHISPER_MODEL`` once.

	    Returns:
	        True if the model was already loaded or was loaded by this call,
	        False if importing or loading raised (the error is printed).
	    """
	    global WHISPER_MODEL
	    if WHISPER_MODEL is not None:
	        return True
	    try:
	        # Imported lazily so the app can start even when whisper is missing.
	        import whisper
	        WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
	        print("✅ Whisper loaded!")
	        return True
	    except Exception as e:
	        # Deliberately broad: callers treat Whisper as optional.
	        print(f"❌ Whisper failed: {e}")
	        return False

	def optimize_audio_input(audio_path, max_duration=25, sample_rate=22050):
	    """Write a trimmed copy of an audio file and return its path.

	    The file is read with ``librosa.load(audio_path, sr=sample_rate)``, cut to
	    at most ``max_duration`` seconds, and written with ``soundfile`` to
	    ``<original name without extension>_opt.wav`` next to the input.

	    Args:
	        audio_path: Path of the audio file to process.
	        max_duration: Maximum length of the output in seconds.
	        sample_rate: Sample rate passed to ``librosa.load`` and used for the
	            written file.

	    Returns:
	        The path of the new ``_opt.wav`` file, or ``audio_path`` unchanged when
	        the file does not exist or any step fails (the error is printed).
	    """
	    try:
	        if not os.path.exists(audio_path):
	            print(f"⚠️ Audio file not found: {audio_path}")
	            return audio_path

	        audio, sr = librosa.load(audio_path, sr=sample_rate)
	        max_samples = int(max_duration * sr)
	        if len(audio) > max_samples:
	            audio = audio[:max_samples]
	            print(f"🔄 Audio trimmed to {max_duration}s")

	        # Derive the output name from the real extension. Chained
	        # str.replace() left the path unchanged for anything other than
	        # lowercase .wav/.mp3 (overwriting the input) and also rewrote
	        # directory names that happened to contain ".wav".
	        root, _ext = os.path.splitext(audio_path)
	        optimized_path = root + "_opt.wav"
	        sf.write(optimized_path, audio, sr)
	        print(f"✅ Audio optimized: {optimized_path}")
	        return optimized_path

	    except Exception as e:
	        # Best effort: fall back to the untouched input rather than failing.
	        print(f"⚠️ Audio optimization failed: {e}")
	        return audio_path

	def safe_file_path(file_input, input_name="audio"):
	    """Extract an existing file path from the various shapes an upload can take.

	    Accepted inputs, checked in this order:
	      * ``None`` -> ``None``
	      * a ``str`` or ``os.PathLike`` path (returned as ``os.fspath`` value)
	      * an object with a ``name`` attribute holding a path
	      * a dict-like object with a ``'name'`` or ``'path'`` key

	    Args:
	        file_input: The value to interpret.
	        input_name: Label used only in diagnostic messages.

	    Returns:
	        The path if it exists on disk, otherwise ``None``. Never raises; any
	        unexpected error is printed and reported as ``None``.
	    """
	    try:
	        if file_input is None:
	            return None

	        # Plain paths. PathLike objects must be handled here: pathlib.Path has
	        # a ``name`` attribute that is only the basename, so the attribute
	        # branch below would look for the wrong file.
	        if isinstance(file_input, (str, os.PathLike)):
	            candidate = os.fspath(file_input)
	            if os.path.exists(candidate):
	                return candidate
	            print(f"⚠️ File path doesn't exist: {file_input}")
	            return None

	        # File-like objects exposing the path as ``.name``.
	        if hasattr(file_input, 'name'):
	            candidate = file_input.name
	            if candidate and os.path.exists(candidate):
	                return candidate

	        # Dict-like payloads.
	        if hasattr(file_input, 'get'):
	            candidate = file_input.get('name') or file_input.get('path')
	            if candidate and os.path.exists(candidate):
	                return candidate

	        print(f"⚠️ Could not extract file path from {input_name}: {type(file_input)}")
	        return None

	    except Exception as e:
	        # Deliberately broad: a malformed upload must not crash the handler.
	        print(f"❌ Error processing {input_name}: {e}")
	        return None

	def voice_to_voice_clone_final(reference_audio, input_audio, language="en"):
	    """Re-speak the content of ``input_audio`` in the voice of ``reference_audio``.

	    Steps: resolve and validate both uploads, load the XTTS and (optionally)
	    Whisper models, trim both clips, transcribe the input clip, then
	    synthesise the transcript with XTTS using the reference clip as the
	    speaker sample.

	    Args:
	        reference_audio: Upload holding the voice to imitate (any shape
	            accepted by ``safe_file_path``).
	        input_audio: Upload holding the speech content to re-speak.
	        language: Language code passed to both Whisper and XTTS; ``'auto'``
	            lets Whisper detect the language.

	    Returns:
	        ``(output_path, message)`` on success, where ``output_path`` is a
	        ``.wav`` file, or ``(None, error_message)`` on any failure. Never
	        raises.
	    """
	    output_path = None
	    keep_output = False   # flipped only when output_path is handed to the caller
	    intermediates = []    # trimmed copies created by this call
	    try:
	        print(f"🎭 Voice cloning request: {language}")
	        print(f"📁 Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")

	        # Extract file paths safely
	        reference_path = safe_file_path(reference_audio, "reference")
	        input_path = safe_file_path(input_audio, "input")

	        if not reference_path:
	            return None, "❌ Could not process reference audio file."
	        if not input_path:
	            return None, "❌ Could not process input audio file."

	        print(f"📁 Processing files - Ref: {reference_path}, Input: {input_path}")

	        # Validate files: anything under 1000 bytes is treated as unusable.
	        if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
	            return None, "❌ Reference audio file is invalid."
	        if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
	            return None, "❌ Input audio file is invalid."

	        # Load models. XTTS is mandatory; Whisper is optional (a fixed
	        # sentence is synthesised when it is unavailable).
	        if not load_xtts_optimized():
	            return None, f"❌ XTTS model failed: {MODEL_STATUS}"
	        load_whisper_optimized()

	        # Optimize audio files
	        print("🔄 Optimizing audio files...")
	        ref_optimized = optimize_audio_input(reference_path, max_duration=20)
	        input_optimized = optimize_audio_input(input_path, max_duration=25)
	        # A returned path that differs from the input is a copy we own and
	        # must remove; an identical path means optimisation was skipped.
	        intermediates = [
	            optimized
	            for optimized, source in (
	                (ref_optimized, reference_path),
	                (input_optimized, input_path),
	            )
	            if optimized != source
	        ]

	        # Transcribe input audio
	        extracted_text = "This is a voice cloning demonstration."
	        if WHISPER_MODEL:
	            try:
	                print("🎤 Transcribing audio...")
	                with torch.no_grad():
	                    result = WHISPER_MODEL.transcribe(
	                        input_optimized,
	                        fp16=(DEVICE == "cuda"),
	                        language=language if language != 'auto' else None,
	                    )
	                text = result.get("text", "").strip()
	                # Very short transcripts are ignored in favour of the default.
	                if len(text) > 5:
	                    extracted_text = text[:400]
	                    print(f"✅ Transcribed: '{extracted_text[:50]}...'")
	            except Exception as e:
	                # Best effort: fall back to the default sentence.
	                print(f"⚠️ Transcription warning: {e}")

	        # Generate cloned voice
	        print("🚀 Generating cloned voice...")

	        # Reserve a unique output name; the handle is closed immediately so
	        # the TTS engine can write to the path itself.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
	            output_path = tmp_file.name

	        try:
	            with patch_torch_load(), torch.no_grad():
	                TTS_MODEL.tts_to_file(
	                    text=extracted_text,
	                    speaker_wav=ref_optimized,
	                    language=language,
	                    file_path=output_path,
	                    temperature=0.7,
	                    length_penalty=1.0,
	                    repetition_penalty=5.0,
	                )
	        except Exception as tts_error:
	            print(f"❌ TTS generation error: {tts_error}")
	            return None, f"❌ Voice generation failed: {tts_error}"
	        finally:
	            # Memory cleanup, also after a failed generation.
	            if DEVICE == "cuda":
	                torch.cuda.empty_cache()
	            gc.collect()

	        # Validate and return output
	        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
	            file_size_kb = os.path.getsize(output_path) / 1024
	            preview = extracted_text[:100] + ("..." if len(extracted_text) > 100 else "")

	            # Built line by line so the text does not depend on source
	            # indentation; plain "|" replaces the invalid "\|" escape.
	            success_message = "\n".join([
	                "✅ VOICE CLONING SUCCESS! 🎉",
	                "",
	                f'📝 Text: "{preview}"',
	                f"🎭 Device: {DEVICE} | Model: {MODEL_STATUS}",
	                f"📊 Output: {file_size_kb:.1f} KB | Language: {language.upper()}",
	                "🔧 Optimizations Applied Successfully",
	            ])

	            print("✅ Voice cloning completed successfully!")

	            # CRITICAL FIX: Return file path directly for Gradio compatibility
	            keep_output = True
	            return output_path, success_message

	        return None, "❌ Voice cloning failed - output file is empty."

	    except Exception as e:
	        error_msg = f"❌ Voice cloning error: {e}"
	        print(error_msg)
	        import traceback
	        print("Full traceback:", traceback.format_exc())
	        return None, error_msg

	    finally:
	        # Best-effort removal of files this call created and no longer needs:
	        # the trimmed copies always, the output file only when it was not
	        # returned to the caller.
	        leftovers = list(intermediates)
	        if output_path and not keep_output:
	            leftovers.append(output_path)
	        for leftover in leftovers:
	            try:
	                os.remove(leftover)
	            except OSError:
	                pass  # already gone or not removable; nothing useful to do

	# CRITICAL: Use gr.Interface (not Blocks) for better API compatibility.
	# All audio components use type="filepath": the handler receives paths for
	# both uploads and returns a path for the generated clip.
	_interface_inputs = [
	    gr.Audio(label="🎤 Reference Audio (Voice to Clone)", type="filepath"),
	    gr.Audio(label="🎵 Input Audio (Content to Transform)", type="filepath"),
	    gr.Dropdown(
	        label="🌍 Language",
	        value="en",
	        choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
	    ),
	]
	_interface_outputs = [
	    gr.Audio(label="🎉 Cloned Voice Result", type="filepath"),
	    gr.Textbox(label="📋 Processing Status", lines=8),
	]
	interface = gr.Interface(
	    fn=voice_to_voice_clone_final,
	    inputs=_interface_inputs,
	    outputs=_interface_outputs,
	    title="🎭 AI Voice Cloning Studio - FINAL",
	    description="Transform voices using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each).",
	    theme=gr.themes.Soft(),
	    allow_flagging="never",
	    # CRITICAL: API endpoint name
	    api_name="voice_to_voice_clone",
	)

	if __name__ == "__main__":
	    # Runs only when the file is executed as a script, not when imported.
	    print("🌐 Launching FINAL CORRECTED Voice Cloning Studio...")

	    # CORRECTED: Proper queue configuration
	    interface.queue(
	        max_size=2,  # Reduced for stability
	        api_open=True,
	        default_concurrency_limit=1,
	    ).launch(
	        server_name="0.0.0.0",  # listen on all network interfaces
	        server_port=7860,
	        share=False,
	        show_api=True,
	        debug=False,  # Disable debug for production
	    )