# mariam-ahmed15's picture
# Update app.py
# 2919490 verified
import torch
import gradio as gr
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import os
# 1. CONFIGURATION
# Hub checkpoint used for both the model architecture and the feature extractor.
MODEL_ID = "facebook/wav2vec2-xls-r-300m"
# Local file whose contents are passed to model.load_state_dict() below.
QUANTIZED_MODEL_PATH = "quantized_model.pth"

# 2. LOAD MODEL
print("Loading model architecture...")
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_ID, num_labels=2)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)

# Apply quantization
# The Linear layers are converted to dynamically quantized int8 modules *before*
# the state dict is loaded — presumably so the module structure matches the
# keys stored in the quantized checkpoint (TODO confirm against the export script).
model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# Load weights
if os.path.exists(QUANTIZED_MODEL_PATH):
    print("Loading quantized weights...")
    try:
        # SECURITY: torch.load unpickles the file, which can execute arbitrary
        # code. Only ship a checkpoint that comes from a trusted source.
        state_dict = torch.load(
            QUANTIZED_MODEL_PATH, map_location=torch.device('cpu')
        )
        model.load_state_dict(state_dict)
    except Exception as e:
        # Deliberate best-effort: keep the app running, but say clearly that
        # the model is not in the state the checkpoint was meant to produce.
        print(f"Error loading weights: {e}")
        print(
            "Warning: quantized weights were not loaded cleanly. "
            "Predictions may be unreliable."
        )
else:
    print(f"Warning: {QUANTIZED_MODEL_PATH} not found. Using random weights.")

# Switch to inference mode (disables dropout and similar training-only layers).
model.eval()
# 3. PREDICTION FUNCTION (NO CHUNKING)
# Sample rate (Hz) the audio is resampled to on load and that is declared to the
# feature extractor; the two must agree, so it is defined once.
_TARGET_SAMPLE_RATE = 16000
_OOM_MESSAGE = "Error: File too long (Out of Memory). Please use a shorter clip."


def predict_audio(audio_path):
    """Classify a whole audio file as deepfake or real in one forward pass.

    Args:
        audio_path: Path to the audio file on disk, or ``None`` when no audio
            was supplied.

    Returns:
        On success, a dict with the keys ``"Deepfake"`` and ``"Real"`` mapped
        to their softmax probabilities. Otherwise a human-readable string:
        ``"No Audio Provided"`` for ``None`` input, or an error message when
        decoding or inference fails. No exception derived from ``Exception``
        propagates to the caller.
    """
    if audio_path is None:
        return "No Audio Provided"

    try:
        # Load the entire file, resampled to the target rate.
        # NOTE(review): decoding compressed formats such as .opus/.m4a depends
        # on the audio backend available to librosa — confirm on the host.
        speech_array, _ = librosa.load(audio_path, sr=_TARGET_SAMPLE_RATE)

        # A file that decodes to zero samples cannot be classified; report it
        # here instead of letting the model fail with an opaque error.
        if len(speech_array) == 0:
            return "Error processing audio: file contains no audio samples"

        # Process the WHOLE file at once; truncation=False keeps every sample.
        inputs = feature_extractor(
            speech_array,
            sampling_rate=_TARGET_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
            truncation=False,
        )

        with torch.no_grad():
            # The full-length input goes through the model in a single call,
            # so memory use grows with clip duration.
            logits = model(**inputs).logits
            probs = torch.nn.functional.softmax(logits, dim=-1)[0]

        # Index 1 is reported as Deepfake and index 0 as Real.
        # NOTE(review): confirm this matches the label mapping used in training.
        return {
            "Deepfake": probs[1].item(),
            "Real": probs[0].item(),
        }
    except MemoryError:
        # Raised by Python/NumPy allocations (e.g. while decoding/resampling);
        # it is not a RuntimeError, so handle it explicitly.
        return _OOM_MESSAGE
    except RuntimeError as e:
        # Out-of-memory conditions inside torch surface as RuntimeError.
        if "memory" in str(e).lower():
            return _OOM_MESSAGE
        return f"Runtime Error: {e}"
    except Exception as e:
        # UI boundary: turn any other failure into a message for the user.
        return f"Error processing audio: {e}"
# 4. CREATE INTERFACE
# Input component: takes an uploaded file or a microphone recording and hands
# the handler a file path (type="filepath").
voice_note_input = gr.Audio(
    sources=["upload", "microphone"],
    type="filepath",
    label="Upload WhatsApp Voice Note",
)

# Output component: label display limited to the top two classes.
verdict_output = gr.Label(num_top_classes=2)

# Wire the prediction function to the components and start the app.
iface = gr.Interface(
    fn=predict_audio,
    inputs=voice_note_input,
    outputs=verdict_output,
    title="Deepfake Audio Detection API (No Chunking)",
    description="Analyzes the full audio file in one pass.",
)
iface.launch()