|
|
import torch |
|
|
import gradio as gr |
|
|
import librosa |
|
|
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor |
|
|
import os |
|
|
|
|
|
|
|
|
# --- Configuration -----------------------------------------------------------
# Hugging Face checkpoint used for both the model architecture and the
# feature extractor.
MODEL_ID = "facebook/wav2vec2-xls-r-300m"
# Local file expected to hold the state dict of the quantized model.
QUANTIZED_MODEL_PATH = "quantized_model.pth"

# --- Model construction ------------------------------------------------------
print("Loading model architecture...")
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=2,
)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)

# Swap every nn.Linear for its dynamically quantized int8 counterpart.
# NOTE(review): presumably done before load_state_dict so the module layout
# matches the keys stored in QUANTIZED_MODEL_PATH -- confirm against the
# script that produced that file.
model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8,
)

# --- Weights -----------------------------------------------------------------
if not os.path.exists(QUANTIZED_MODEL_PATH):
    print(f"Warning: {QUANTIZED_MODEL_PATH} not found. Using random weights.")
else:
    print("Loading quantized weights...")
    try:
        model.load_state_dict(
            torch.load(QUANTIZED_MODEL_PATH, map_location=torch.device("cpu"))
        )
    except Exception as load_error:
        # Best effort: report the failure and keep serving with whatever
        # weights the model currently holds.
        print(f"Error loading weights: {load_error}")

# Inference only: put the model in evaluation mode.
model.eval()
|
|
|
|
|
|
|
|
def predict_audio(audio_path):
    """Score one audio file as real or deepfake speech in a single pass.

    The clip is loaded with ``librosa`` at 16 kHz, preprocessed by the
    module-level ``feature_extractor`` and classified by the module-level
    ``model``. The whole clip is fed to the model at once (no chunking, no
    truncation), so memory use grows with clip length.

    Args:
        audio_path: Filesystem path of the audio file, or ``None`` when no
            audio was supplied.

    Returns:
        On success, a dict ``{"Deepfake": p, "Real": q}`` holding the softmax
        probabilities of the two output classes. Otherwise a human-readable
        message string: ``"No Audio Provided"`` for ``None`` input, an
        out-of-memory hint, or a description of the error. Errors raised
        during processing are reported as strings rather than propagated.
    """
    if audio_path is None:
        return "No Audio Provided"

    sample_rate = 16000
    oom_message = "Error: File too long (Out of Memory). Please use a shorter clip."

    try:
        # Resample to the rate that is also passed to the feature extractor.
        speech_array, _ = librosa.load(audio_path, sr=sample_rate)

        # An empty clip cannot be scored; say so explicitly instead of
        # surfacing an opaque error from inside the model.
        if speech_array.size == 0:
            return "Error processing audio: the audio clip contains no samples."

        inputs = feature_extractor(
            speech_array,
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding=True,
            truncation=False,
        )

        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.nn.functional.softmax(logits, dim=-1)[0]

        # NOTE(review): assumes class index 1 is "fake" and index 0 is "real";
        # confirm against the label order used when the weights were trained.
        return {
            "Deepfake": probs[1].item(),
            "Real": probs[0].item(),
        }

    except MemoryError:
        # Allocation failures outside torch (e.g. while decoding the audio)
        # raise MemoryError, not RuntimeError; report them the same way.
        return oom_message
    except RuntimeError as e:
        if "memory" in str(e).lower():
            return oom_message
        return f"Runtime Error: {str(e)}"
    except Exception as e:
        return f"Error processing audio: {str(e)}"
|
|
|
|
|
|
|
|
# Input widget: accepts an uploaded file or a microphone recording and passes
# the handler a file path (type="filepath").
_audio_input = gr.Audio(
    sources=["upload", "microphone"],
    type="filepath",
    label="Upload WhatsApp Voice Note",
)

# Output widget: label display limited to the top two classes.
_label_output = gr.Label(num_top_classes=2)

# Wire the prediction handler to the widgets.
iface = gr.Interface(
    fn=predict_audio,
    inputs=_audio_input,
    outputs=_label_output,
    title="Deepfake Audio Detection API (No Chunking)",
    description="Analyzes the full audio file in one pass.",
)

# Start the web app as soon as the module is executed.
iface.launch()