# Demo / app.py
import os
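# Accept the Coqui TTS license non-interactively so the XTTS checkpoint can be downloaded without a prompt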
os.environ["COQUI_TOS_AGREED"] = "1"
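# Newer PyTorch releases default torch.load to weights_only=True, so the XTTS config
# classes must be allow-listed before the checkpoint can be deserialized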
import torch
from torch.serialization import add_safe_globals
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig
add_safe_globals([XttsConfig, XttsAudioConfig])
from TTS.api import TTS
from speechbrain.inference import SpeakerRecognition
from transformers import pipeline
import gradio as gr
import numpy as np
import soundfile as sf
from scipy.signal import resample
from scipy.io.wavfile import write as write_wav
from tempfile import NamedTemporaryFile
# Load voice cloning model (XTTS)
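# XTTS v2 does zero-shot voice cloning from a short reference clip; gpu=False keeps inference on CPU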
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False, gpu=False)
# Load spoof detection models
sb = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmp_model")  # ECAPA-TDNN speaker verification
ast_pipe = pipeline("audio-classification", model="MattyB95/AST-VoxCelebSpoof-Synthetic-Voice-Detection")  # AST synthetic-voice detector
df_pipe = pipeline("audio-classification", model="MelodyMachine/Deepfake-audio-detection-V2")  # deepfake-audio classifier
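# Each audio-classification pipeline call returns a list of {"label", "score"} dicts, highest score first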
def spoof_and_detect(voice_sample, desired_sr=16000):
    # Gradio's numpy audio input arrives as a (sample_rate, data) tuple
    ref_sr, ref_audio_array = voice_sample
    # Mix down to mono and convert integer PCM to float32 in [-1, 1]
    if ref_audio_array.ndim > 1:
        ref_audio_array = ref_audio_array.mean(axis=1)
    if np.issubdtype(ref_audio_array.dtype, np.integer):
        ref_audio_array = ref_audio_array.astype("float32") / np.iinfo(ref_audio_array.dtype).max
    # Resample to 16 kHz
    if ref_sr != desired_sr:
        duration = ref_audio_array.shape[0] / ref_sr
        num_samples = int(duration * desired_sr)
        ref_audio_array = resample(ref_audio_array, num_samples)
        ref_sr = desired_sr
    # Save the reference audio to a temporary WAV file
    with NamedTemporaryFile(suffix=".wav", mode='wb', delete=False) as ref_wav:
        ref_temp_path = ref_wav.name
        write_wav(ref_temp_path, ref_sr, ref_audio_array.astype("float32"))
    # Clone the voice: XTTS uses the reference recording as the speaker prompt
    clone_path = ref_temp_path.replace(".wav", "_clone.wav")
    tts.tts_to_file(
        text="My voice is my password.",
        speaker_wav=ref_temp_path,
        file_path=clone_path,
        language="en"
    )
    # Run the three detectors on both the reference recording and the clone
    sb_score, sb_label = sb.verify_files(ref_temp_path, clone_path)
    ast_ref = ast_pipe(ref_temp_path)[0]
    ast_clone = ast_pipe(clone_path)[0]
    df_ref = df_pipe(ref_temp_path)[0]
    df_clone = df_pipe(clone_path)[0]
    results = {
        "SpeechBrain same-speaker": f"{bool(sb_label.item())} (score {sb_score.item():.2f})",
        "AST REF": f"{ast_ref['label']} ({ast_ref['score']:.2f})",
        "AST CLONE": f"{ast_clone['label']} ({ast_clone['score']:.2f})",
        "Deepfake REF": f"{df_ref['label']} ({df_ref['score']:.2f})",
        "Deepfake CLONE": f"{df_clone['label']} ({df_clone['score']:.2f})",
    }
    return ref_temp_path, clone_path, results
# Gradio 4+ expects sources=[...] for the Audio input (older releases used source="microphone")
demo = gr.Interface(
    fn=spoof_and_detect,
    inputs=gr.Audio(sources=["microphone"], type="numpy", label="🎤 Record your voice"),
    outputs=[
        gr.Audio(label="🎧 Original"),
        gr.Audio(label="🎧 Cloned"),
        gr.JSON(label="🧪 Spoof Detection Results")
    ],
    title="Voice Cloning + Spoof Detection",
    description="Clone a speaker's voice with XTTS and check both the original and the clone against three spoof detection models."
)
demo.launch()