|
import gradio as gr |
|
import torch |
|
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification |
|
from speechbrain.inference import EncoderClassifier |
|
import torchaudio |
|
|
|
|
|
# Feature extractor/tokenizer for the wav2vec2 family; normalizes raw waveforms
# into model input values.
# NOTE(review): the processor is loaded from "facebook/wav2vec2-base-960h" while
# the classifier below comes from a different checkpoint — presumably the
# detection model shares the base feature extractor, but confirm the two agree
# (sampling rate, normalization) before relying on the scores.
deepfake_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Fine-tuned wav2vec2 sequence classifier used for the fake/real decision.
deepfake_model = Wav2Vec2ForSequenceClassification.from_pretrained("MelodyMachine/Deepfake-audio-detection-V2")
|
|
|
|
|
class SpeakerRecognition:
    """Thin wrapper around the SpeechBrain ECAPA-TDNN speaker encoder."""

    def __init__(self):
        # Downloads (or reuses) the pretrained checkpoint under savedir.
        self.model = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb",
        )

    def encode_batch(self, signal):
        """Return speaker embeddings for a batch of waveforms.

        Delegates directly to the underlying SpeechBrain encoder.
        """
        return self.model.encode_batch(signal)


# Module-level singleton so the checkpoint is loaded once at startup.
classifier = SpeakerRecognition()
|
|
|
def analyze_audio(audio):
    """Run deepfake detection and speaker-embedding extraction on one file.

    Args:
        audio: Filesystem path to the uploaded audio (gradio ``Audio`` with
            ``type="filepath"``), or ``None`` when nothing was provided.

    Returns:
        tuple: ``(deepfake_result, speaker_embedding)`` where
        ``deepfake_result`` is ``"Fake"`` or ``"Real"`` and
        ``speaker_embedding`` is a (possibly nested) list of floats.
    """
    # Gradio passes None when the user submits without recording/uploading.
    if audio is None:
        return "No audio provided", []

    signal, fs = torchaudio.load(audio)
    # Downmix multi-channel audio to mono, keeping a (1, time) shape.
    signal = signal.mean(dim=0, keepdim=True)

    # Both wav2vec2 and the ECAPA speaker encoder are trained on 16 kHz audio;
    # feeding the file's native rate (e.g. 44.1 kHz) produces wrong results,
    # so resample first.
    target_sr = 16000
    if fs != target_sr:
        signal = torchaudio.functional.resample(signal, fs, target_sr)
        fs = target_sr

    deepfake_inputs = deepfake_processor(
        signal.squeeze().numpy(),
        sampling_rate=fs,
        return_tensors="pt",
        padding="longest",
    ).input_values
    # Inference only — disable autograd to save memory/time.
    with torch.no_grad():
        deepfake_logits = deepfake_model(deepfake_inputs).logits
    deepfake_probabilities = torch.nn.functional.softmax(deepfake_logits, dim=-1)
    # NOTE(review): assumes class index 1 == "fake" for this checkpoint;
    # verify against deepfake_model.config.id2label.
    deepfake_prediction = torch.argmax(deepfake_probabilities, dim=-1).item()

    # Speaker embedding from the (resampled) mono waveform.
    embeddings = classifier.encode_batch(signal)

    deepfake_result = "Fake" if deepfake_prediction == 1 else "Real"
    speaker_embedding = embeddings.squeeze().tolist()
    return deepfake_result, speaker_embedding
|
|
|
|
|
# Gradio UI: one audio input (passed as a file path to analyze_audio) and two
# outputs — the fake/real verdict as text and the speaker embedding as JSON.
iface = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Deepfake Detection"),
        gr.JSON(label="Speaker Embedding")
    ],
    title="Deepfake Audio Detection and Speaker Recognition",
    description="Upload an audio file to detect if it's a deepfake and to extract speaker embeddings."
)
|
|
|
# Start the gradio server only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()
|
|