import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
from speechbrain.inference import EncoderClassifier

TARGET_SR = 16000  # both models below expect 16 kHz mono audio

# Load the deepfake detection model. The base-960h processor is used only for
# its feature extractor (raw-waveform normalization at 16 kHz); its CTC
# tokenizer is never used here.
deepfake_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
deepfake_model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "MelodyMachine/Deepfake-audio-detection-V2"
)
deepfake_model.eval()

# Load the speaker recognition model (ECAPA-TDNN trained on VoxCeleb)
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
)


def analyze_audio(audio):
    # Load audio, convert to mono, and resample to 16 kHz if needed
    signal, fs = torchaudio.load(audio)
    signal = signal.mean(dim=0, keepdim=True)
    if fs != TARGET_SR:
        signal = torchaudio.functional.resample(signal, fs, TARGET_SR)
        fs = TARGET_SR

    # Deepfake detection
    deepfake_inputs = deepfake_processor(
        signal.squeeze().numpy(),
        sampling_rate=fs,
        return_tensors="pt",
        padding="longest",
    ).input_values
    with torch.no_grad():
        deepfake_logits = deepfake_model(deepfake_inputs).logits
    deepfake_probabilities = torch.nn.functional.softmax(deepfake_logits, dim=-1)
    deepfake_prediction = torch.argmax(deepfake_probabilities, dim=-1).item()

    # Speaker recognition: extract a fixed-size speaker embedding
    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)

    # Results. The index-to-label mapping depends on the checkpoint's config;
    # verify against deepfake_model.config.id2label before trusting index 1 == Fake.
    deepfake_result = "Fake" if deepfake_prediction == 1 else "Real"
    speaker_embedding = embeddings.squeeze().tolist()

    return deepfake_result, speaker_embedding


# Gradio interface
iface = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Deepfake Detection"),
        gr.JSON(label="Speaker Embedding"),
    ],
    title="Deepfake Audio Detection and Speaker Recognition",
    description="Upload an audio file to detect if it's a deepfake and to extract speaker embeddings.",
)

if __name__ == "__main__":
    iface.launch()
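

# --- Illustrative sketch (not wired into the app above) ---
# ECAPA-TDNN embeddings like the ones returned by analyze_audio are usually
# compared with cosine similarity for speaker verification. The helper below
# is a minimal sketch under that assumption: the function name, the audio
# paths, and the 0.25 threshold are hypothetical placeholders, and the
# threshold should be tuned on labeled data before being relied on. In a real
# module this would sit above the __main__ guard.
def same_speaker(audio_path_a: str, audio_path_b: str, threshold: float = 0.25) -> bool:
    """Heuristically decide whether two recordings share a speaker."""

    def embed(path):
        # Same preprocessing as analyze_audio: mono, 16 kHz
        signal, fs = torchaudio.load(path)
        signal = signal.mean(dim=0, keepdim=True)
        if fs != TARGET_SR:
            signal = torchaudio.functional.resample(signal, fs, TARGET_SR)
        with torch.no_grad():
            return classifier.encode_batch(signal).squeeze()

    score = torch.nn.functional.cosine_similarity(
        embed(audio_path_a), embed(audio_path_b), dim=-1
    )
    return score.item() > threshold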