""" | |
Unified AI-Image & Deepfake Detector | |
=================================== | |
β’ Combines a generic AI-image detector (Swin-V2 + SuSy) *and* | |
a deepfake-specialist face detector (Inception-ResNet V1). | |
β’ Always runs both experts β fuses their calibrated scores. | |
β’ Works on images **and** short videos (β€ 30 s). | |
Add/keep in requirements.txt (versions pinned earlier): | |
torch torchvision facenet-pytorch transformers torchcam captum timm | |
mediapipe opencv-python-headless pillow scikit-image matplotlib | |
gradio fpdf pandas numpy absl-py ttach | |
""" | |
# ───────────────────── bootstrap for extra wheels ──────────────────────
import os, uuid, warnings, math, tempfile
from pathlib import Path
from typing import List, Tuple

warnings.filterwarnings("ignore")

def _ensure_deps():
    """Install extras that are occasionally missing from the base image."""
    try:
        import mediapipe, fpdf  # noqa: F401
    except ImportError:
        os.system("pip install --quiet --upgrade mediapipe fpdf")

_ensure_deps()
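# A safer variant (just a sketch, same packages) would route the install
# through the running interpreter instead of the shell:
#     import subprocess, sys
#     subprocess.check_call([sys.executable, "-m", "pip", "install",
#                            "--quiet", "--upgrade", "mediapipe", "fpdf"])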
# ─────────────────────────────── imports ───────────────────────────────
import cv2
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from fpdf import FPDF
import mediapipe as mp
from facenet_pytorch import InceptionResnetV1, MTCNN
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from torchvision import transforms
from transformers import AutoImageProcessor, AutoModelForImageClassification
from torchcam.methods import GradCAM as TCGradCAM
from captum.attr import Saliency
from skimage.feature import graycomatrix, graycoprops
import matplotlib.pyplot as plt
import pandas as pd
import spaces
# ───────────────────────── runtime / models ────────────────────────────
plt.set_loglevel("ERROR")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Deep-fake specialist
_face_det = MTCNN(select_largest=False, post_process=False, device=device).eval()
_df_model = InceptionResnetV1(pretrained="vggface2", classify=True, num_classes=1, device=device)
_df_model.load_state_dict(torch.load("resnet_inception.pth", map_location="cpu")["model_state_dict"])
_df_model.to(device).eval()
_df_cam = GradCAM(_df_model, target_layers=[_df_model.block8.branch1[-1]],
                  use_cuda=device.type == "cuda")
# Helper: fetch a sub-module by exact name, falling back to suffix match
def _get_layer(model, name: str):
    mods = dict(model.named_modules())
    return mods.get(name) or next(m for n, m in mods.items() if n.endswith(name))
# Binary AI-image detector (Swin-V2)
BIN_ID = "haywoodsloan/ai-image-detector-deploy"
_bin_proc = AutoImageProcessor.from_pretrained(BIN_ID)
_bin_mod = AutoModelForImageClassification.from_pretrained(BIN_ID).to(device).eval()
_CAM_LAYER_BIN = "encoder.layers.3.blocks.1.layernorm_after"
_bin_cam = TCGradCAM(_bin_mod, target_layer=_get_layer(_bin_mod, _CAM_LAYER_BIN))
# Generator classifier (SuSy → ScriptModule → Captum only)
_susy_mod = torch.jit.load("SuSy.pt").to(device).eval()
_GEN_CLASSES = ["Stable Diffusion 1.x", "DALL·E 3",
                "MJ V5/V6", "Stable Diffusion XL", "MJ V1/V2"]
_PATCH, _TOP = 224, 5
_to_tensor = transforms.ToTensor()
_to_gray = transforms.Compose([transforms.PILToTensor(), transforms.Grayscale()])
# ─────────────── calibration placeholders (optional tune) ──────────────
_calib_df_slope, _calib_df_inter = 1.0, 0.0
_calib_ai_slope, _calib_ai_inter = 1.0, 0.0

# Logistic (Platt) mapping to enable once the slopes/intercepts are fitted:
# def _calibrate_df(p: float) -> float:
#     return 1 / (1 + math.exp(-(_calib_df_slope * (p + _calib_df_inter))))
# def _calibrate_ai(p: float) -> float:
#     return 1 / (1 + math.exp(-(_calib_ai_slope * (p + _calib_ai_inter))))

def _calibrate_df(p: float) -> float:  # keep raw score for now
    return p

def _calibrate_ai(p: float) -> float:
    return p
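# Minimal sketch of how the slope/intercept pairs above could be fitted
# offline, assuming a small labelled validation set of raw scores.
# `fit_platt` is a hypothetical helper, not part of this app:
#     from sklearn.linear_model import LogisticRegression
#     def fit_platt(raw_scores, labels):
#         lr = LogisticRegression().fit(np.asarray(raw_scores).reshape(-1, 1), labels)
#         a, b = float(lr.coef_[0, 0]), float(lr.intercept_[0])
#         return a, b / a  # matches the slope * (p + inter) form used above
#     # _calib_df_slope, _calib_df_inter = fit_platt(df_val_scores, df_val_labels)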
# ───────────────────────────── misc helpers ────────────────────────────
UNCERTAIN_GAP = 0.10
MIN_FRAMES, MAX_SAMPLES = 4, 20

def _extract_landmarks(rgb: np.ndarray) -> Tuple[np.ndarray, np.ndarray | None]:
    """Draw MediaPipe face-mesh landmarks; the second slot is reserved (always None)."""
    mesh = mp.solutions.face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1)
    res = mesh.process(rgb); mesh.close()
    if not res.multi_face_landmarks:
        return rgb, None
    h, w, _ = rgb.shape
    out = rgb.copy()
    for lm in res.multi_face_landmarks[0].landmark:
        cx, cy = int(lm.x * w), int(lm.y * h)
        cv2.circle(out, (cx, cy), 1, (0, 255, 0), -1)
    return out, None
def _overlay_cam(cam, base):
    # torchcam returns tensors; make sure 'cam' is a NumPy array on CPU
    if torch.is_tensor(cam):
        cam = cam.detach().cpu().numpy()
    cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-6)
    heat = Image.fromarray(
        (plt.cm.jet(cam)[:, :, :3] * 255).astype(np.uint8)
    ).resize((base.shape[1], base.shape[0]), Image.BICUBIC)
    return Image.blend(
        Image.fromarray(base).convert("RGBA"),
        heat.convert("RGBA"),
        alpha=0.45,
    )
def _render_pdf(title: str, verdict: str, conf: dict, pages: List[Image.Image]) -> str:
    out = Path(f"/tmp/report_{uuid.uuid4().hex}.pdf")
    pdf = FPDF(); pdf.set_auto_page_break(True, 15); pdf.add_page()
    pdf.set_font("Helvetica", size=14); pdf.cell(0, 10, title, ln=True, align="C")
    pdf.ln(4); pdf.set_font("Helvetica", size=12)
    pdf.multi_cell(0, 6, f"Verdict: {verdict}\n"
                         f"Confidence -> Real {conf['real']:.3f}  Fake {conf['fake']:.3f}")
    for idx, img in enumerate(pages):
        pdf.ln(4); pdf.set_font("Helvetica", size=11)
        pdf.cell(0, 6, f"Figure {idx + 1}", ln=True)
        tmp = Path(tempfile.gettempdir()) / f"fig_{uuid.uuid4().hex}.jpg"
        img.convert("RGB").save(tmp, format="JPEG")  # JPEG cannot store alpha
        pdf.image(str(tmp), x=10, w=90)
        tmp.unlink(missing_ok=True)
    pdf.output(out)
    return str(out)
# ────────────────────────── SuSy helpers (saliency) ────────────────────
def _susy_cam(tensor: torch.Tensor, class_idx: int) -> np.ndarray:
    sal = Saliency(_susy_mod)
    grad = sal.attribute(tensor, target=class_idx).abs().mean(1, keepdim=True)
    return grad.squeeze().detach().cpu().numpy()
def _susy_predict(img: Image.Image):
    w, h = img.size
    npx, npy = max(1, w // _PATCH), max(1, h // _PATCH)
    patches = np.zeros((npx * npy, _PATCH, _PATCH, 3), dtype=np.uint8)
    for i in range(npx):
        for j in range(npy):
            x, y = i * _PATCH, j * _PATCH
            patches[i * npy + j] = np.array(img.crop((x, y, x + _PATCH, y + _PATCH))
                                               .resize((_PATCH, _PATCH)))
    # Rank patches by GLCM contrast and keep the _TOP most textured ones
    contrasts = []
    for p in patches:
        g = _to_gray(Image.fromarray(p)).squeeze(0).numpy()
        glcm = graycomatrix(g, [5], [0], 256, symmetric=True, normed=True)
        contrasts.append(graycoprops(glcm, "contrast")[0, 0])
    idx = np.argsort(contrasts)[::-1][:_TOP]
    tens = torch.from_numpy(patches[idx].transpose(0, 3, 1, 2)).float() / 255.0
    with torch.no_grad():
        probs = _susy_mod(tens.to(device)).softmax(-1).mean(0).cpu().numpy()[1:]  # drop 'authentic'
    return dict(zip(_GEN_CLASSES, probs))
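# Illustrative output shape (values are made up):
#     {'Stable Diffusion 1.x': 0.12, 'DALL·E 3': 0.55, 'MJ V5/V6': 0.18,
#      'Stable Diffusion XL': 0.10, 'MJ V1/V2': 0.05}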
# ───────────────────────────── fusion math ─────────────────────────────
def _fuse(p_ai: float, p_df: float) -> float:
    # noisy-OR: the image counts as fake if either expert thinks it is
    return 1 - (1 - p_ai) * (1 - p_df)

def _verdict(p: float) -> str:
    return "uncertain" if abs(p - 0.5) <= UNCERTAIN_GAP else ("fake" if p > 0.5 else "real")
# ─────────────────────────── IMAGE PIPELINE ────────────────────────────
@spaces.GPU  # ZeroGPU: allocate a GPU for the duration of the call
def _predict_image(pil: Image.Image):
    pil = pil.convert("RGB")  # guard against RGBA / grayscale uploads
    gallery: List[Image.Image] = []
    # Deep-fake path
    try:
        face = _face_det(pil)
    except Exception:
        face = None
    if face is not None:
        ft = F.interpolate(face.unsqueeze(0), (256, 256), mode="bilinear",
                           align_corners=False).float() / 255.0
        p_df_raw = torch.sigmoid(_df_model(ft.to(device))).item()
        p_df = _calibrate_df(p_df_raw)
        crop_np = (ft.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
        cam_df = _df_cam(ft, [ClassifierOutputTarget(0)])[0]
        gallery.append(_overlay_cam(cam_df, crop_np))
        # np.array(pil) is already RGB, which is what MediaPipe expects
        gallery.append(Image.fromarray(_extract_landmarks(np.array(pil))[0]))
    else:
        p_df = 0.5  # no face found: the specialist abstains
    # Binary AI model
    inp_bin = _bin_proc(images=pil, return_tensors="pt").to(device)
    probs_bin = _bin_mod(**inp_bin).logits.softmax(-1)[0]  # [artificial, human]
    p_ai_raw = probs_bin[0].item()  # index 0 is the AI class in this checkpoint's label order
    p_ai = _calibrate_ai(p_ai_raw)
    winner_idx = 0 if p_ai_raw >= probs_bin[1].item() else 1
    inp_bin_h = {k: v.clone().detach().requires_grad_(True) for k, v in inp_bin.items()}
    cam_bin = _bin_cam(winner_idx, scores=_bin_mod(**inp_bin_h).logits)[0]
    gallery.append(_overlay_cam(cam_bin, np.array(pil)))
    # Generator breakdown (SuSy) if the AI side wins
    bar_plot = gr.update(visible=False)
    if p_ai_raw > probs_bin[1].item():
        gen_probs = _susy_predict(pil)
        bar_plot = gr.update(value=pd.DataFrame(gen_probs.items(), columns=["class", "prob"]),
                             visible=True)
        susy_in = _to_tensor(pil.resize((224, 224))).unsqueeze(0).to(device)
        g_idx = _susy_mod(susy_in)[0, 1:].argmax().item() + 1  # skip 'authentic' at index 0
        cam_susy = _susy_cam(susy_in, g_idx)
        gallery.append(_overlay_cam(cam_susy, np.array(pil)))
    # Fusion
    p_final = _fuse(p_ai, p_df)
    verdict = _verdict(p_final)
    conf = {"real": round(1 - p_final, 4), "fake": round(p_final, 4)}
    pdf = _render_pdf("Unified Detector", verdict, conf, gallery[:3])
    return verdict, conf, gallery, bar_plot, pdf
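# Quick local smoke test (a sketch; assumes some sample.jpg on disk):
#     verdict, conf, gallery, _, pdf_path = _predict_image(Image.open("sample.jpg"))
#     print(verdict, conf, pdf_path)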
# ─────────────────────────── VIDEO PIPELINE ────────────────────────────
def _sample_idx(n):  # at most MAX_SAMPLES evenly spaced frame indices
    return list(range(n)) if n <= MAX_SAMPLES else np.linspace(0, n - 1, MAX_SAMPLES, dtype=int)
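# Example: a 12-frame clip is scanned in full, while a 100-frame clip yields
# 20 near-evenly spaced indices (0, 5, 10, ..., 99) via np.linspace.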
@spaces.GPU  # ZeroGPU: allocate a GPU for the duration of the call
def _predict_video(path: str):
    cap = cv2.VideoCapture(path); total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 1
    probs, frames = [], []
    for i in _sample_idx(total):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ok, frm = cap.read()
        if not ok:
            continue
        pil = Image.fromarray(cv2.cvtColor(frm, cv2.COLOR_BGR2RGB))
        verdict, conf, _, _, _ = _predict_image(pil)
        probs.append(conf["fake"])
        if len(frames) < MIN_FRAMES:
            frames.append(pil)  # reuse the RGB frame; raw 'frm' is BGR
    cap.release()
    if not probs:
        blank = Image.new("RGB", (256, 256))
        return "No frames analysed", {"real": 0, "fake": 0}, [blank]
    p_final = float(np.mean(probs))
    return _verdict(p_final), {"real": round(1 - p_final, 4), "fake": round(p_final, 4)}, frames
# ──────────────────────────────── UI ───────────────────────────────────
_css = "footer{visibility:hidden!important}.logo,#logo{display:none!important}"

with gr.Blocks(css=_css, title="Unified AI-Fake & Deepfake Detector") as demo:
    gr.Markdown("""
    ## Unified AI-Fake & Deepfake Detector
    Upload an **image** or a short **video**.
    The app fuses two complementary models, then shows heat-maps & a PDF report.
    """)
    with gr.Tab("Image"):
        with gr.Row():
            with gr.Column(scale=1):
                img_in = gr.Image(label="Upload image", type="pil")
                btn_i = gr.Button("Analyze")
            with gr.Column(scale=2):
                txt_v = gr.Textbox(label="Verdict", interactive=False)
                lbl_c = gr.Label(label="Confidence")
                gal = gr.Gallery(label="Explanations", columns=3, height=320)
                bar = gr.BarPlot(x="class", y="prob", title="Likely generator",
                                 y_label="probability", visible=False)
                pdf_f = gr.File(label="Download PDF report")
        btn_i.click(_predict_image, img_in, [txt_v, lbl_c, gal, bar, pdf_f])
    with gr.Tab("Video"):
        with gr.Row():
            with gr.Column(scale=1):
                vid_in = gr.Video(label="Upload MP4/AVI", format="mp4")
                btn_v = gr.Button("Analyze")
            with gr.Column(scale=2):
                txt_vv = gr.Textbox(label="Verdict", interactive=False)
                lbl_cv = gr.Label(label="Confidence")
                gal_v = gr.Gallery(label="Sample frames", columns=4, height=240)
        btn_v.click(_predict_video, vid_in, [txt_vv, lbl_cv, gal_v])
demo.launch(show_api=False)  # share links are unnecessary (and ignored) on Spaces