Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import tempfile | |
| import os | |
| import warnings | |
| from contextlib import contextmanager | |
| import gc | |
| import librosa | |
| import soundfile as sf | |
# Suppress every warning category so the application log only shows the
# app's own progress messages.
warnings.filterwarnings(action="ignore")

# Presumably pre-accepts the Coqui model licence so loading never blocks on an
# interactive prompt -- confirm against the TTS library documentation.
os.environ.update({"COQUI_TOS_AGREED": "1"})

print("π Starting FINAL CORRECTED Voice Cloning Studio...")
@contextmanager
def patch_torch_load():
    """Temporarily force ``torch.load`` to run with ``weights_only=False``.

    Intended to be used as ``with patch_torch_load(): ...``.  While the
    ``with`` body runs, ``torch.load`` is replaced by a wrapper that overrides
    the ``weights_only`` keyword; the previous ``torch.load`` is restored on
    exit, even when the body raises.

    The ``@contextmanager`` decorator is required: without it this function
    returns a plain generator, which does not implement the context-manager
    protocol, so every ``with patch_torch_load():`` would fail.

    NOTE(security): ``weights_only=False`` allows full pickle deserialization,
    which can execute arbitrary code.  Only load checkpoints from a trusted
    source inside this context.
    """
    original_load = torch.load

    def patched_load(f, *args, **kwargs):
        # Override unconditionally: the point of the patch is to win even when
        # the caller (e.g. a third-party library) passes weights_only=True.
        kwargs['weights_only'] = False
        return original_load(f, *args, **kwargs)

    torch.load = patched_load
    try:
        yield
    finally:
        # Always undo the monkey-patch so code outside the context keeps the
        # loader that was installed before we entered.
        torch.load = original_load
# Hardware setup: run on the GPU whenever torch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"π₯ Device: {DEVICE}")

# Global model state. Both model handles start empty and are filled in by the
# loader functions; MODEL_STATUS is the human-readable XTTS load state that is
# echoed back in status messages.
TTS_MODEL = None
WHISPER_MODEL = None
MODEL_STATUS = "Not Loaded"
def load_xtts_optimized():
    """Load the XTTS-v2 model into ``TTS_MODEL`` the first time it is needed.

    Subsequent calls are no-ops once a model is cached.  ``MODEL_STATUS`` is
    updated to reflect success or the failure reason.

    Returns:
        True when a model is available (already cached or freshly loaded),
        False when loading raised an exception.
    """
    global TTS_MODEL, MODEL_STATUS

    if TTS_MODEL is None:
        try:
            # The TTS import and model construction both run with torch.load
            # patched, so checkpoint loading uses weights_only=False.
            with patch_torch_load():
                from TTS.api import TTS

                print("π¦ Loading XTTS...")
                use_gpu = DEVICE == "cuda"
                TTS_MODEL = TTS(
                    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                    progress_bar=False,
                    gpu=use_gpu,
                )
            MODEL_STATUS = "XTTS-v2 Ready"
            print("β XTTS loaded successfully!")
        except Exception as load_error:
            # Record the reason so callers can surface it to the user.
            print(f"β XTTS loading failed: {load_error}")
            MODEL_STATUS = f"XTTS Failed: {str(load_error)}"
            return False

    return True
def load_whisper_optimized():
    """Load the Whisper "base" model into ``WHISPER_MODEL`` on first use.

    Returns:
        True when a model is available (already cached or freshly loaded),
        False when importing or loading Whisper raised an exception.
    """
    global WHISPER_MODEL

    if WHISPER_MODEL is None:
        try:
            # Imported lazily so a missing/broken whisper install only
            # disables transcription instead of breaking module import.
            import whisper

            WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
            print("β Whisper loaded!")
        except Exception as load_error:
            print(f"β Whisper failed: {load_error}")
            return False

    return True
def optimize_audio_input(audio_path, max_duration=25):
    """Write a resampled, length-capped WAV copy of *audio_path*.

    The audio is loaded at 22,050 Hz, truncated to at most *max_duration*
    seconds, and written next to the source as ``<stem>_opt.wav``.

    Args:
        audio_path: Path of the audio file to process.
        max_duration: Maximum length, in seconds, of the written copy.

    Returns:
        The path of the ``_opt.wav`` copy on success.  If the source file is
        missing or any step fails, *audio_path* is returned unchanged so the
        caller can still try to use the original (best-effort behaviour).
    """
    try:
        if not os.path.exists(audio_path):
            print(f"β οΈ Audio file not found: {audio_path}")
            return audio_path

        audio, sr = librosa.load(audio_path, sr=22050)

        max_samples = int(max_duration * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f"π Audio trimmed to {max_duration}s")

        # Derive the output name from the real extension.  Substituting
        # '.wav'/'.mp3' with str.replace left every other extension (.flac,
        # .ogg, .WAV, ...) unchanged, so the "optimized" path was the source
        # file itself; it also rewrote matching text inside directory names.
        stem, _extension = os.path.splitext(audio_path)
        optimized_path = f"{stem}_opt.wav"

        sf.write(optimized_path, audio, sr)
        print(f"β Audio optimized: {optimized_path}")
        return optimized_path
    except Exception as e:
        # Deliberately broad: optimization is optional, so any decoding or
        # write failure falls back to the untouched input.
        print(f"β οΈ Audio optimization failed: {e}")
        return audio_path
def safe_file_path(file_input, input_name="audio"):
    """Extract an existing filesystem path from various input formats.

    Supported inputs:
      * ``None`` -> ``None``.
      * A ``str`` or ``os.PathLike`` path.
      * An object with a ``name`` attribute (e.g. an open file).
      * A dict-like object exposing ``get`` with a ``'name'`` and/or
        ``'path'`` entry; each entry is tried in that order.

    Args:
        file_input: The value to inspect.
        input_name: Label used in diagnostic messages only.

    Returns:
        The first candidate path that exists on disk, or ``None`` when no
        usable path could be found or an error occurred while looking.
    """
    try:
        if file_input is None:
            return None

        # Direct path.  os.PathLike is handled here because a pathlib.Path
        # also has a ``name`` attribute, but that is only the basename and
        # would not resolve to the real file.
        if isinstance(file_input, (str, os.PathLike)):
            file_path = os.fspath(file_input)
            if os.path.exists(file_path):
                return file_path
            print(f"β οΈ File path doesn't exist: {file_input}")
            return None

        candidates = []
        # File-like object with a name attribute.
        if hasattr(file_input, 'name'):
            candidates.append(file_input.name)
        # Dict-like object: try every known key instead of only the first
        # non-empty one, so a stale 'name' does not hide a valid 'path'.
        if hasattr(file_input, 'get'):
            candidates.extend(file_input.get(key) for key in ('name', 'path'))

        for candidate in candidates:
            # Skip empty values and non-path values (e.g. the integer file
            # descriptor that fd-backed file objects report as ``name``).
            if not candidate or not isinstance(candidate, (str, os.PathLike)):
                continue
            if os.path.exists(candidate):
                return os.fspath(candidate)

        print(f"β οΈ Could not extract file path from {input_name}: {type(file_input)}")
        return None
    except Exception as e:
        # Never let a malformed input object crash the request handler.
        print(f"β Error processing {input_name}: {e}")
        return None
def _transcribe_or_default(audio_path, language, default_text):
    """Return Whisper's transcript of *audio_path*, or *default_text* if unavailable."""
    if not WHISPER_MODEL:
        return default_text
    try:
        print("π€ Transcribing audio...")
        with torch.no_grad():
            result = WHISPER_MODEL.transcribe(
                audio_path,
                fp16=(DEVICE == "cuda"),
                # 'auto' is mapped to None for the Whisper call.
                language=language if language != 'auto' else None,
            )
        text = result.get("text", "").strip()
        if text and len(text) > 5:
            transcript = text[:400]  # cap the text handed to the TTS model
            print(f"β Transcribed: '{transcript[:50]}...'")
            return transcript
    except Exception as e:
        # Transcription is optional: fall back to the default sentence.
        print(f"β οΈ Transcription warning: {e}")
    return default_text


def _discard_files(paths):
    """Best-effort removal of temporary files; never raises."""
    for path in paths:
        if not path:
            continue
        try:
            os.remove(path)
        except FileNotFoundError:
            continue  # already gone (e.g. the same path listed twice)
        except OSError as err:
            print(f"Could not remove temporary file {path}: {err}")


def voice_to_voice_clone_final(reference_audio, input_audio, language="en"):
    """Re-speak the content of *input_audio* in the voice of *reference_audio*.

    Pipeline: resolve both inputs to file paths, validate them, load the
    models, write optimized copies of both clips, transcribe the input clip
    (falling back to a fixed sentence when Whisper is unavailable or the
    transcript is too short), then synthesize that text with XTTS using the
    reference clip as the speaker sample.

    Args:
        reference_audio: Voice sample to imitate (path, file object or dict).
        input_audio: Clip whose spoken content is re-synthesized.
        language: Language code passed to both Whisper and XTTS.

    Returns:
        ``(output_path, status_message)``.  ``output_path`` is the generated
        WAV file on success and ``None`` on any failure; ``status_message`` is
        always a human-readable description of the outcome.  The function
        never raises: unexpected errors are reported via the message.
    """
    scratch_files = []   # intermediate files this call created and must delete
    output_path = None
    succeeded = False
    try:
        print(f"π Voice cloning request: {language}")
        print(f"π Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")

        # Extract file paths safely
        reference_path = safe_file_path(reference_audio, "reference")
        input_path = safe_file_path(input_audio, "input")
        if not reference_path:
            return None, "β Could not process reference audio file."
        if not input_path:
            return None, "β Could not process input audio file."
        print(f"π Processing files - Ref: {reference_path}, Input: {input_path}")

        # Validate files: anything under 1000 bytes is rejected
        if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
            return None, "β Reference audio file is invalid."
        if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
            return None, "β Input audio file is invalid."

        # Load models; XTTS is mandatory, Whisper is optional
        if not load_xtts_optimized():
            return None, f"β XTTS model failed: {MODEL_STATUS}"
        load_whisper_optimized()

        # Optimize audio files
        print("π Optimizing audio files...")
        ref_optimized = optimize_audio_input(reference_path, max_duration=20)
        input_optimized = optimize_audio_input(input_path, max_duration=25)
        # optimize_audio_input may hand back the original path; only paths
        # that differ from the caller's files are ours to delete afterwards.
        originals = (reference_path, input_path)
        scratch_files = [
            path for path in (ref_optimized, input_optimized)
            if path not in originals
        ]

        # Transcribe input audio
        extracted_text = _transcribe_or_default(
            input_optimized, language, "This is a voice cloning demonstration."
        )

        # Generate cloned voice
        print("π Generating cloned voice...")
        # delete=False: the file must outlive this handle so XTTS can write to
        # it and the caller can read it; failed runs remove it in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        try:
            with patch_torch_load(), torch.no_grad():
                TTS_MODEL.tts_to_file(
                    text=extracted_text,
                    speaker_wav=ref_optimized,
                    language=language,
                    file_path=output_path,
                    temperature=0.7,
                    length_penalty=1.0,
                    repetition_penalty=5.0,
                )
        except Exception as tts_error:
            print(f"β TTS generation error: {tts_error}")
            return None, f"β Voice generation failed: {str(tts_error)}"

        # Validate and return output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
            file_size_kb = os.path.getsize(output_path) / 1024
            preview = extracted_text[:100] + ('...' if len(extracted_text) > 100 else '')
            success_message = (
                "β VOICE CLONING SUCCESS! π\n"
                f'π Text: "{preview}"\n'
                f"π Device: {DEVICE} | Model: {MODEL_STATUS}\n"
                f"π Output: {file_size_kb:.1f} KB | Language: {language.upper()}\n"
                "π§ Optimizations Applied Successfully"
            )
            print("β Voice cloning completed successfully!")
            succeeded = True
            # CRITICAL FIX: Return file path directly for Gradio compatibility
            return output_path, success_message

        return None, "β Voice cloning failed - output file is empty."
    except Exception as e:
        error_msg = f"β Voice cloning error: {str(e)}"
        print(error_msg)
        import traceback
        print("Full traceback:", traceback.format_exc())
        return None, error_msg
    finally:
        # Runs on every exit path.  The output file is kept only when it is
        # being returned to the caller; everything else this call created is
        # removed so repeated requests do not fill the disk.
        if output_path and not succeeded:
            scratch_files.append(output_path)
        _discard_files(scratch_files)
        # Memory cleanup (also after failed generations).
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
# CRITICAL: the UI is built with gr.Interface (not Blocks) for better API
# compatibility.  Components are created in the order they appear in the UI.

# CRITICAL: both audio inputs must use type="filepath" for API compatibility,
# so the handler receives paths on disk.
_INPUT_COMPONENTS = [
    gr.Audio(
        label="π€ Reference Audio (Voice to Clone)",
        type="filepath",
    ),
    gr.Audio(
        label="π΅ Input Audio (Content to Transform)",
        type="filepath",
    ),
    gr.Dropdown(
        choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
        value="en",
        label="π Language",
    ),
]

# CRITICAL: the audio output must also be type="filepath" because the handler
# returns the generated file's path.
_OUTPUT_COMPONENTS = [
    gr.Audio(
        label="π Cloned Voice Result",
        type="filepath",
    ),
    gr.Textbox(
        label="π Processing Status",
        lines=8,
    ),
]

interface = gr.Interface(
    fn=voice_to_voice_clone_final,
    inputs=_INPUT_COMPONENTS,
    outputs=_OUTPUT_COMPONENTS,
    title="π AI Voice Cloning Studio - FINAL",
    description="Transform voices using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each).",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    # CRITICAL: name of the API endpoint exposed for this function.
    api_name="voice_to_voice_clone",
)
if __name__ == "__main__":
    print("π Launching FINAL CORRECTED Voice Cloning Studio...")

    # Queue configuration: a short queue (reduced for stability) and a
    # default concurrency limit of one.
    queue_options = {
        "max_size": 2,
        "api_open": True,
        "default_concurrency_limit": 1,
    }
    # Server configuration: listen on all interfaces at port 7860, no public
    # share link, API page shown, debug disabled for production.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_api": True,
        "debug": False,
    }

    # launch() is called on whatever queue() returns, as in a chained call.
    queued_interface = interface.queue(**queue_options)
    queued_interface.launch(**launch_options)