voiceclone-dev / app.py
crackuser's picture
Update app.py
ba703e9 verified
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import warnings
from contextlib import contextmanager
import gc
import librosa
import soundfile as sf
# Accept the Coqui terms of service non-interactively; set before any TTS
# import happens further down in the file.
os.environ.update({"COQUI_TOS_AGREED": "1"})

# Silence every Python warning for the lifetime of the process.
warnings.filterwarnings(action="ignore")

print("πŸš€ Starting FINAL CORRECTED Voice Cloning Studio...")
@contextmanager
def patch_torch_load():
    """Temporarily force ``torch.load`` to run with ``weights_only=False``.

    While the ``with`` block is active, every call to ``torch.load`` goes
    through a wrapper that sets ``weights_only=False`` (overriding whatever
    the caller passed) and then delegates to the real function. The original
    ``torch.load`` is restored on exit, even if the body raises.

    The wrapper is decorated with ``functools.wraps`` so that code which
    introspects ``torch.load`` (name, docstring, ``inspect.signature``) still
    sees the real function's metadata while the patch is installed.

    SECURITY: ``weights_only=False`` permits full unpickling, which can run
    arbitrary code embedded in the file being loaded. Only use this context
    around checkpoints that come from a trusted source.

    NOTE: this swaps a module attribute, so the patch is visible to every
    caller of ``torch.load`` in the process for the duration of the block.
    """
    import functools

    original_load = torch.load

    @functools.wraps(original_load)
    def patched_load(f, *args, **kwargs):
        # Deliberately overrides any caller-supplied value.
        kwargs['weights_only'] = False
        return original_load(f, *args, **kwargs)

    torch.load = patched_load
    try:
        yield
    finally:
        # Always undo the patch, including when the body raised.
        torch.load = original_load
# Hardware setup: run on the GPU when PyTorch reports one, otherwise on CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"πŸ”₯ Device: {DEVICE}")

# Module-level model handles; both start empty and are filled in by the
# load_* functions. MODEL_STATUS is the human-readable XTTS state.
TTS_MODEL = WHISPER_MODEL = None
MODEL_STATUS = "Not Loaded"
def load_xtts_optimized():
    """Load the XTTS-v2 model into ``TTS_MODEL`` if it is not loaded yet.

    The import and construction run inside ``patch_torch_load()``. On success
    ``MODEL_STATUS`` becomes "XTTS-v2 Ready"; on any exception it records the
    failure text instead.

    Returns:
        True when a model is available (already loaded or loaded now),
        False when loading raised.
    """
    global TTS_MODEL, MODEL_STATUS

    already_loaded = TTS_MODEL is not None
    if already_loaded:
        return True

    use_gpu = DEVICE == "cuda"
    try:
        with patch_torch_load():
            # Imported lazily so the heavy dependency is only paid on first use.
            from TTS.api import TTS as CoquiTTS
            print("πŸ“¦ Loading XTTS...")
            TTS_MODEL = CoquiTTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=False,
                gpu=use_gpu,
            )
            MODEL_STATUS = "XTTS-v2 Ready"
            print("βœ… XTTS loaded successfully!")
            return True
    except Exception as load_error:
        print(f"❌ XTTS loading failed: {load_error}")
        MODEL_STATUS = f"XTTS Failed: {load_error}"
        return False
def load_whisper_optimized():
    """Load the Whisper "base" model into ``WHISPER_MODEL`` on first use.

    Returns:
        True when a model is available (already loaded or loaded now),
        False when importing or loading Whisper raised.
    """
    global WHISPER_MODEL

    if WHISPER_MODEL is None:
        try:
            # Lazy import: Whisper is only needed once a request arrives.
            import whisper as whisper_lib
            WHISPER_MODEL = whisper_lib.load_model("base", device=DEVICE)
            print("βœ… Whisper loaded!")
        except Exception as load_error:
            print(f"❌ Whisper failed: {load_error}")
            return False
    return True
def optimize_audio_input(audio_path, max_duration=25):
    """Write a trimmed, 22050 Hz WAV copy of ``audio_path`` and return its path.

    The audio is loaded with ``librosa.load(..., sr=22050)``, cut to at most
    ``max_duration`` seconds, and written next to the source file as
    ``<stem>_opt.wav``.

    The output name is derived with ``os.path.splitext`` so that it always
    differs from the input path, whatever the input extension is. The earlier
    ``str.replace`` approach left the path unchanged for anything other than
    lowercase ``.wav``/``.mp3``, which made the write clobber the caller's
    input file, and it also rewrote ``.wav``/``.mp3`` inside directory names.

    Args:
        audio_path: Path of the audio file to process.
        max_duration: Maximum length, in seconds, of the optimized copy.

    Returns:
        The path of the optimized WAV file. This is a best-effort step: if the
        input does not exist or anything raises, ``audio_path`` is returned
        unchanged.
    """
    try:
        if not os.path.exists(audio_path):
            print(f"⚠️ Audio file not found: {audio_path}")
            return audio_path

        audio, sr = librosa.load(audio_path, sr=22050)

        max_samples = int(max_duration * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f"πŸ”„ Audio trimmed to {max_duration}s")

        # Only the final extension is replaced; the directory part is untouched.
        stem, _extension = os.path.splitext(audio_path)
        optimized_path = f"{stem}_opt.wav"

        sf.write(optimized_path, audio, sr)
        print(f"βœ… Audio optimized: {optimized_path}")
        return optimized_path
    except Exception as e:
        # Deliberate fallback: callers continue with the unoptimized file.
        print(f"⚠️ Audio optimization failed: {e}")
        return audio_path
def safe_file_path(file_input, input_name="audio"):
    """Extract an existing filesystem path from various input formats.

    Accepted shapes, checked in this order:

    1. ``str`` or ``os.PathLike`` (e.g. ``pathlib.Path``): used directly.
    2. Object with a ``name`` attribute (e.g. an open file / temp-file wrapper).
    3. Mapping-like object with ``get``: the ``'name'`` key, then ``'path'``.

    For shapes 2 and 3 every candidate is tried, so a stale ``name`` no longer
    hides a valid ``path``. Candidates that are not path-like (for instance an
    integer file descriptor in ``.name``) are ignored.

    Args:
        file_input: The value to inspect; may be ``None``.
        input_name: Label used in diagnostic messages only.

    Returns:
        The path as ``str`` if one is found and exists on disk, else ``None``.
        Never raises; unexpected errors are printed and yield ``None``.
    """
    try:
        if file_input is None:
            return None

        # Direct path: plain string or any os.PathLike object.
        if isinstance(file_input, (str, os.PathLike)):
            file_path = os.fsdecode(file_input)
            if os.path.exists(file_path):
                return file_path
            print(f"⚠️ File path doesn't exist: {file_path}")
            return None

        # Collect candidates from file-like objects and dict-like payloads.
        candidates = []
        if hasattr(file_input, 'name'):
            candidates.append(file_input.name)
        if hasattr(file_input, 'get'):
            candidates.extend(file_input.get(key) for key in ('name', 'path'))

        for candidate in candidates:
            if not candidate or not isinstance(candidate, (str, os.PathLike)):
                continue
            file_path = os.fsdecode(candidate)
            if os.path.exists(file_path):
                return file_path

        print(f"⚠️ Could not extract file path from {input_name}: {type(file_input)}")
        return None
    except Exception as e:
        print(f"❌ Error processing {input_name}: {e}")
        return None
def _remove_quietly(path):
    """Best-effort delete of a temporary file; never raises."""
    if not path:
        return
    try:
        os.remove(path)
    except FileNotFoundError:
        # Already gone (or never created) - nothing to clean up.
        pass
    except OSError as cleanup_error:
        print(f"Temporary file cleanup skipped for {path}: {cleanup_error}")


def voice_to_voice_clone_final(reference_audio, input_audio, language="en"):
    """Re-speak the content of ``input_audio`` in the voice of ``reference_audio``.

    Pipeline: resolve both inputs to file paths, validate them, load the
    models, write optimized copies of both files, transcribe the input with
    Whisper (falling back to a fixed demo sentence when Whisper is unavailable
    or yields too little text), then synthesize that text with XTTS using the
    reference file as ``speaker_wav``.

    Temporary files are cleaned up on every exit path: the optimized
    intermediate copies are always removed, and the output file is removed
    unless it is being returned. GPU cache / garbage collection runs after
    generation whether it succeeded or failed.

    Args:
        reference_audio: Voice sample to clone (anything ``safe_file_path``
            accepts).
        input_audio: Audio whose spoken content should be re-synthesized.
        language: Language code passed to Whisper and XTTS.

    Returns:
        ``(output_path, status_message)`` on success, otherwise
        ``(None, error_message)``. This function does not raise; unexpected
        errors are reported through the message.
    """
    output_path = None
    succeeded = False
    intermediate_files = []
    try:
        print(f"🎭 Voice cloning request: {language}")
        print(f"πŸ“ Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")

        # Extract file paths safely
        reference_path = safe_file_path(reference_audio, "reference")
        input_path = safe_file_path(input_audio, "input")
        if not reference_path:
            return None, "❌ Could not process reference audio file."
        if not input_path:
            return None, "❌ Could not process input audio file."
        print(f"πŸ“ Processing files - Ref: {reference_path}, Input: {input_path}")

        # Validate files (anything under 1000 bytes is treated as invalid)
        if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
            return None, "❌ Reference audio file is invalid."
        if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
            return None, "❌ Input audio file is invalid."

        # Load models; XTTS is mandatory, Whisper is optional
        if not load_xtts_optimized():
            return None, f"❌ XTTS model failed: {MODEL_STATUS}"
        load_whisper_optimized()

        # Optimize audio files
        print("πŸ”„ Optimizing audio files...")
        ref_optimized = optimize_audio_input(reference_path, max_duration=20)
        input_optimized = optimize_audio_input(input_path, max_duration=25)
        # Track only files this call created; when optimization falls back to
        # the original path, the caller's file must not be deleted.
        intermediate_files = [
            optimized
            for optimized, source in (
                (ref_optimized, reference_path),
                (input_optimized, input_path),
            )
            if optimized != source
        ]

        # Transcribe input audio
        extracted_text = "This is a voice cloning demonstration."
        if WHISPER_MODEL:
            try:
                print("🎀 Transcribing audio...")
                with torch.no_grad():
                    result = WHISPER_MODEL.transcribe(
                        input_optimized,
                        fp16=(DEVICE == "cuda"),
                        language=language if language != 'auto' else None
                    )
                text = result.get("text", "").strip()
                if text and len(text) > 5:
                    # Cap the text handed to the synthesizer at 400 characters.
                    extracted_text = text[:400]
                    print(f"βœ… Transcribed: '{extracted_text[:50]}...'")
            except Exception as e:
                # Non-fatal: continue with the default sentence.
                print(f"⚠️ Transcription warning: {e}")

        # Generate cloned voice
        print("πŸš€ Generating cloned voice...")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        try:
            with patch_torch_load(), torch.no_grad():
                TTS_MODEL.tts_to_file(
                    text=extracted_text,
                    speaker_wav=ref_optimized,
                    language=language,
                    file_path=output_path,
                    temperature=0.7,
                    length_penalty=1.0,
                    repetition_penalty=5.0
                )
        except Exception as tts_error:
            print(f"❌ TTS generation error: {tts_error}")
            return None, f"❌ Voice generation failed: {str(tts_error)}"
        finally:
            # Memory cleanup also on failure, when it is needed the most.
            if DEVICE == "cuda":
                torch.cuda.empty_cache()
            gc.collect()

        # Validate and return output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
            file_size_kb = os.path.getsize(output_path) / 1024
            success_message = f"""βœ… VOICE CLONING SUCCESS! πŸŽ‰
πŸ“ Text: "{extracted_text[:100]}{'...' if len(extracted_text) > 100 else ''}"
🎭 Device: {DEVICE} | Model: {MODEL_STATUS}
πŸ“Š Output: {file_size_kb:.1f} KB | Language: {language.upper()}
πŸ”§ Optimizations Applied Successfully"""
            print("βœ… Voice cloning completed successfully!")
            succeeded = True
            # Return the file path directly for Gradio compatibility
            return output_path, success_message
        else:
            return None, "❌ Voice cloning failed - output file is empty."
    except Exception as e:
        error_msg = f"❌ Voice cloning error: {str(e)}"
        print(error_msg)
        import traceback
        print("Full traceback:", traceback.format_exc())
        return None, error_msg
    finally:
        # The output file is kept only when it is handed back to the caller.
        if not succeeded:
            _remove_quietly(output_path)
        for leftover in intermediate_files:
            _remove_quietly(leftover)
# The UI is a gr.Interface (not Blocks) for better API compatibility.
# Every audio component uses type="filepath": the handler receives file paths
# and returns a file path.
_reference_audio_input = gr.Audio(
    label="🎀 Reference Audio (Voice to Clone)",
    type="filepath",
)
_content_audio_input = gr.Audio(
    label="🎡 Input Audio (Content to Transform)",
    type="filepath",
)
_language_selector = gr.Dropdown(
    choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
    value="en",
    label="🌍 Language",
)
_cloned_audio_output = gr.Audio(
    label="πŸŽ‰ Cloned Voice Result",
    type="filepath",
)
_status_output = gr.Textbox(
    label="πŸ“‹ Processing Status",
    lines=8,
)

interface = gr.Interface(
    fn=voice_to_voice_clone_final,
    inputs=[_reference_audio_input, _content_audio_input, _language_selector],
    outputs=[_cloned_audio_output, _status_output],
    title="🎭 AI Voice Cloning Studio - FINAL",
    description="Transform voices using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each).",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    # Name of the API endpoint exposed for this function.
    api_name="voice_to_voice_clone",
)
if __name__ == "__main__":
    print("🌐 Launching FINAL CORRECTED Voice Cloning Studio...")

    # Small queue and a single concurrent worker, chosen for stability.
    queue_settings = dict(
        max_size=2,
        api_open=True,
        default_concurrency_limit=1,
    )
    # Listen on all interfaces on port 7860; no share link, debug disabled
    # for production.
    launch_settings = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
        debug=False,
    )

    interface.queue(**queue_settings).launch(**launch_settings)