# NOTE: file header reconstructed — the original export contained Hugging Face
# file-viewer residue ("edit def analyze_audio / f311efb verified / raw /
# history blame contribute delete / No virus / 2.08 kB") which is not Python.
import gradio as gr
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
from speechbrain.inference import EncoderClassifier
import torchaudio
# Load the deepfake detection model.
# NOTE(review): the processor is taken from the base facebook/wav2vec2-base-960h
# checkpoint while the classification weights come from the fine-tuned
# MelodyMachine repo — presumably the fine-tuned repo ships no processor config;
# confirm its feature extraction (16 kHz, normalization) matches the base one.
deepfake_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
deepfake_model = Wav2Vec2ForSequenceClassification.from_pretrained("MelodyMachine/Deepfake-audio-detection-V2")
# Speaker-recognition model, loaded via SpeechBrain's pretrained-model API.
class SpeakerRecognition:
    """Thin wrapper around the SpeechBrain ECAPA-TDNN speaker encoder."""

    def __init__(self):
        # Downloads (or reuses the cached copy of) the pretrained VoxCeleb
        # encoder into ./pretrained_models on first use.
        self.model = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb",
        )

    def encode_batch(self, signal):
        """Return speaker embeddings for a batch of waveforms.

        Delegates directly to the underlying encoder's ``encode_batch``.
        """
        return self.model.encode_batch(signal)
classifier = SpeakerRecognition()
def analyze_audio(audio):
    """Run deepfake detection and speaker-embedding extraction on one file.

    Parameters
    ----------
    audio : str
        Path to the uploaded audio file (Gradio's ``type="filepath"``).

    Returns
    -------
    tuple
        ``(verdict, embedding)`` where ``verdict`` is ``"Fake"`` or
        ``"Real"`` and ``embedding`` is the speaker embedding as a
        (nested) list of floats.
    """
    target_sr = 16000  # Wav2Vec2 checkpoints are trained on 16 kHz audio

    # Load audio and collapse all channels to a single mono channel.
    signal, fs = torchaudio.load(audio)
    signal = signal.mean(dim=0, keepdim=True)

    # BUGFIX: the processor's feature extractor expects 16 kHz input; feeding
    # it an arbitrary file sample rate errors or degrades predictions, so
    # resample whenever the file is not already at the target rate.
    if fs != target_sr:
        signal = torchaudio.functional.resample(signal, orig_freq=fs, new_freq=target_sr)
        fs = target_sr

    # Deepfake detection: preprocess, then classify without gradients.
    deepfake_inputs = deepfake_processor(
        signal.squeeze().numpy(),
        sampling_rate=fs,
        return_tensors="pt",
        padding="longest",
    ).input_values
    with torch.no_grad():  # inference only — no autograd bookkeeping needed
        deepfake_logits = deepfake_model(deepfake_inputs).logits
    deepfake_probabilities = torch.nn.functional.softmax(deepfake_logits, dim=-1)
    deepfake_prediction = torch.argmax(deepfake_probabilities, dim=-1).item()

    # Speaker recognition: extract the embedding for the whole (mono) signal.
    embeddings = classifier.encode_batch(signal)

    # NOTE(review): assumes class index 1 == fake for this checkpoint — confirm
    # against deepfake_model.config.id2label.
    deepfake_result = "Fake" if deepfake_prediction == 1 else "Real"
    speaker_embedding = embeddings.squeeze().tolist()
    return deepfake_result, speaker_embedding
# Gradio UI wiring: one audio-file input, two outputs (verdict + embedding).
_detection_output = gr.Textbox(label="Deepfake Detection")
_embedding_output = gr.JSON(label="Speaker Embedding")

iface = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[_detection_output, _embedding_output],
    title="Deepfake Audio Detection and Speaker Recognition",
    description="Upload an audio file to detect if it's a deepfake and to extract speaker embeddings.",
)

if __name__ == "__main__":
    # Start the local Gradio server only when executed as a script.
    iface.launch()