Spaces:
Runtime error
Runtime error
| import os | |
| os.environ["COQUI_TOS_AGREED"] = "1" | |
| import torch | |
| from torch.serialization import add_safe_globals | |
| from TTS.tts.configs.xtts_config import XttsConfig | |
| from TTS.tts.models.xtts import XttsAudioConfig | |
| add_safe_globals([XttsConfig, XttsAudioConfig]) | |
| from TTS.api import TTS | |
| from speechbrain.inference import SpeakerRecognition | |
| from transformers import pipeline | |
| import gradio as gr | |
| import numpy as np | |
| import soundfile as sf | |
| from scipy.signal import resample | |
| from scipy.io.wavfile import write as write_wav | |
| from tempfile import NamedTemporaryFile | |
# Voice-cloning backend: Coqui XTTS v2, loaded with gpu=False.
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=False,
    gpu=False,
)

# Detection backends: one SpeechBrain speaker verifier plus two Hugging Face
# audio-classification pipelines. All are created once at import time and
# reused by every request.
sb = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="tmp_model",
)
ast_pipe = pipeline(
    "audio-classification",
    model="MattyB95/AST-VoxCelebSpoof-Synthetic-Voice-Detection",
)
df_pipe = pipeline(
    "audio-classification",
    model="MelodyMachine/Deepfake-audio-detection-V2",
)
def _as_float32_pcm(samples):
    """Return *samples* as float32, scaling signed-integer PCM to [-1.0, 1.0)."""
    samples = np.asarray(samples)
    if samples.dtype.kind == "i":
        # A float WAV is expected to hold values in [-1, 1]; a bare cast of
        # integer PCM would keep the raw integer amplitudes.
        full_scale = float(-np.iinfo(samples.dtype).min)
        return samples.astype("float32") / full_scale
    return samples.astype("float32")


def spoof_and_detect(voice_sample, desired_sr=16000):
    """Clone the recorded voice and run the detectors on both recordings.

    Args:
        voice_sample: A two-item sequence holding the sample rate and the
            audio samples. Gradio's ``type="numpy"`` audio value is
            ``(sample_rate, data)``; the reversed ``(data, sample_rate)``
            order is accepted as well.
        desired_sr: Sample rate, in Hz, that the reference audio is
            resampled to before it is written to disk.

    Returns:
        A ``(reference_wav_path, clone_wav_path, results)`` tuple where
        ``results`` maps a detector name to a formatted verdict string.
        Both WAV files are left on disk because their paths are returned
        to the caller.

    Raises:
        ValueError: If no audio was supplied or the recording is empty.
    """
    if voice_sample is None:
        raise ValueError("No audio received: record a voice sample first.")

    first, second = voice_sample
    # The scalar item is the sample rate, the other one is the waveform.
    if np.ndim(first) == 0:
        ref_sr, raw_audio = first, second
    else:
        raw_audio, ref_sr = first, second
    ref_sr = int(ref_sr)

    ref_audio_array = _as_float32_pcm(raw_audio)
    if ref_audio_array.size == 0:
        raise ValueError("The recorded audio is empty.")

    # Resample to the target rate (FFT-based, along the sample axis).
    if ref_sr != desired_sr:
        # Integer ratio first, then round: avoids losing a sample to
        # floating-point truncation.
        num_samples = max(1, round(ref_audio_array.shape[0] * desired_sr / ref_sr))
        ref_audio_array = resample(ref_audio_array, num_samples)
        ref_sr = desired_sr

    # Reserve a unique file name, then write by path once the handle is
    # closed so the file is never opened twice at the same time.
    with NamedTemporaryFile(suffix=".wav", mode="wb", delete=False) as ref_wav:
        ref_temp_path = ref_wav.name
    write_wav(ref_temp_path, ref_sr, ref_audio_array.astype("float32"))

    # Clone voice. splitext only touches the extension, unlike str.replace,
    # which would rewrite every ".wav" occurring anywhere in the path.
    clone_path = os.path.splitext(ref_temp_path)[0] + "_clone.wav"
    tts.tts_to_file(
        text="My voice is my password.",
        speaker_wav=ref_temp_path,
        file_path=clone_path,
        language="en",
    )

    # Spoof detection: speaker verification of reference vs. clone, plus the
    # first result of each classifier on both files.
    _sb_score, sb_label = sb.verify_files(ref_temp_path, clone_path)
    ast_ref = ast_pipe(ref_temp_path)[0]
    ast_clone = ast_pipe(clone_path)[0]
    df_ref = df_pipe(ref_temp_path)[0]
    df_clone = df_pipe(clone_path)[0]

    results = {
        "SpeechBrain": str(sb_label.item()),
        "AST REF": f"{ast_ref['label']} ({ast_ref['score']:.2f})",
        "AST CLONE": f"{ast_clone['label']} ({ast_clone['score']:.2f})",
        "Deepfake REF": f"{df_ref['label']} ({df_ref['score']:.2f})",
        "Deepfake CLONE": f"{df_clone['label']} ({df_clone['score']:.2f})",
    }
    return ref_temp_path, clone_path, results
# Gradio 4 replaced the Audio keyword ``source="microphone"`` with
# ``sources=["microphone"]``; pass whichever the installed release accepts.
if int(gr.__version__.split(".")[0]) >= 4:
    _mic_source = {"sources": ["microphone"]}
else:
    _mic_source = {"source": "microphone"}

# One microphone recording in; the reference audio, the cloned audio and the
# detector verdicts out.
demo = gr.Interface(
    fn=spoof_and_detect,
    inputs=gr.Audio(type="numpy", label="🎤 Record your voice", **_mic_source),
    outputs=[
        gr.Audio(label="🎧 Original"),
        gr.Audio(label="🎧 Cloned"),
        gr.JSON(label="🧪 Spoof Detection Results"),
    ],
    title="Voice Cloning + Spoof Detection",
    description="Clone a speaker's voice and evaluate with 3 spoof detection models.",
)
demo.launch()