Spaces:

PraneshJs
/

fakevideodetect

Running

App Files Files Community

PraneshJs committed on Aug 13

Commit

6d0c2aa

verified ·

1 Parent(s): 2b0e2be

Update inference_2.py

Browse files

Files changed (1) hide show

inference_2.py +64 -59

inference_2.py CHANGED Viewed

@@ -3,22 +3,19 @@ import cv2
 import onnx
 import torch
 import numpy as np
-from types import SimpleNamespace
-from onnx2pytorch import ConvertModel
 from models.TMC import ETMC
 from models import image
-# -----------------------------
-# Load ONNX -> PyTorch safely
-# -----------------------------
 onnx_model = onnx.load('checkpoints/efficientnet.onnx')
 pytorch_model = ConvertModel(onnx_model)
 torch.manual_seed(42)
-# -----------------------------
-# Audio model arguments
-# -----------------------------
 audio_args = {
     'nb_samp': 64600,
     'first_conv': 1024,
@@ -28,53 +25,49 @@ audio_args = {
     'nb_fc_node': 1024,
     'gru_node': 1024,
     'nb_gru_layer': 3,
-    'nb_classes': 2,
     'device': 'cpu'
 }
-audio_args_obj = SimpleNamespace(**audio_args)
-# -----------------------------
-# Load Audio Model
-# -----------------------------
-def load_audio_model():
-    spec_model = image.RawNet(audio_args_obj)
-    ckpt = torch.load('checkpoints/model.pth', map_location='cpu')
-    spec_model.load_state_dict(ckpt['spec_encoder'], strict=True)
-    spec_model.eval()
-    return spec_model
-spec_model = load_audio_model()
-# -----------------------------
-# Load Image Model
-# -----------------------------
-def load_image_model():
-    rgb_encoder = pytorch_model
-    ckpt = torch.load('checkpoints/model.pth', map_location='cpu')
-    rgb_encoder.load_state_dict(ckpt['rgb_encoder'], strict=True)
-    rgb_encoder.eval()
-    return rgb_encoder
-img_model = load_image_model()
-# -----------------------------
 # Preprocessing functions
-# -----------------------------
 def preprocess_img(face):
     face = face / 255.0
     face = cv2.resize(face, (256, 256))
     face_pt = torch.unsqueeze(torch.Tensor(face), dim=0)
     return face_pt
 def preprocess_audio(audio_file):
     audio_pt = torch.unsqueeze(torch.Tensor(audio_file), dim=0)
     return audio_pt
 def preprocess_video(input_video, n_frames=3):
     v_cap = cv2.VideoCapture(input_video)
     v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    sample = np.linspace(0, v_len-1, n_frames).astype(int)
     frames = []
     for j in range(v_len):
         success = v_cap.grab()
@@ -88,40 +81,52 @@ def preprocess_video(input_video, n_frames=3):
     v_cap.release()
     return frames
-# -----------------------------
-# Inference functions
-# -----------------------------
 def deepfakes_spec_predict(input_audio):
-    audio = preprocess_audio(input_audio)
-    spec_grads = spec_model.forward(audio)
-    spec_grads_np = np.exp(spec_grads.cpu().detach().numpy().squeeze())
-    max_value = np.argmax(spec_grads_np)
-    if max_value > 0.5:
         text2 = f"The audio is REAL."
     else:
         text2 = f"The audio is FAKE."
     return text2
 def deepfakes_image_predict(input_image):
     face = preprocess_img(input_image)
-    img_grads = img_model.forward(face).cpu().detach().numpy().squeeze()
-    if img_grads[0] > 0.5:
-        text2 = f"The image is REAL. Confidence: {img_grads[0]*100:.3f}%"
     else:
-        text2 = f"The image is FAKE. Confidence: {img_grads[1]*100:.3f}%"
     return text2
 def deepfakes_video_predict(input_video):
     video_frames = preprocess_video(input_video)
-    real_list, fake_list = [], []
     for face in video_frames:
-        img_grads = img_model.forward(face).cpu().detach().numpy().squeeze()
-        real_list.append(img_grads[0])
-        fake_list.append(img_grads[1])
-    real_mean = np.mean(real_list)
-    fake_mean = np.mean(fake_list)
-    if real_mean > 0.5:
-        text2 = f"The video is REAL. Confidence: {real_mean*100:.3f}%"
     else:
-        text2 = f"The video is FAKE. Confidence: {fake_mean*100:.3f}%"
     return text2

 import onnx
 import torch
 import numpy as np
+import argparse
 from models.TMC import ETMC
 from models import image
+from onnx2pytorch import ConvertModel
+import types
+# Load ONNX model and convert to PyTorch
 onnx_model = onnx.load('checkpoints/efficientnet.onnx')
 pytorch_model = ConvertModel(onnx_model)
 torch.manual_seed(42)
+# Audio model parameters
 audio_args = {
     'nb_samp': 64600,
     'first_conv': 1024,
     'nb_fc_node': 1024,
     'gru_node': 1024,
     'nb_gru_layer': 3,
+    'nb_classes': 2
+}
+# Create a complete args object for RawNet
+audio_args_complete = {
+    **audio_args,
+    'pretrained_audio_encoder': False,
+    'freeze_audio_encoder': False,
     'device': 'cpu'
 }
+audio_args_obj = types.SimpleNamespace(**audio_args_complete)
+# Load models
+spec_model = image.RawNet(audio_args_obj)
+spec_model_ckpt = torch.load('checkpoints/model.pth', map_location='cpu')
+spec_model.load_state_dict(spec_model_ckpt['spec_encoder'], strict=True)
+spec_model.eval()
+img_model = pytorch_model
+img_model_ckpt = torch.load('checkpoints/model.pth', map_location='cpu')
+img_model.load_state_dict(img_model_ckpt['rgb_encoder'], strict=True)
+img_model.eval()
 # Preprocessing functions
 def preprocess_img(face):
     """Turn one face image into a single-item batch tensor.

     The input is divided by 255.0, resized with ``cv2.resize`` to
     256x256, wrapped in a ``torch.Tensor`` and given a leading dimension
     of size 1.

     NOTE(review): assumes ``face`` is a NumPy image array with values in
     0-255 -- confirm against the caller.
     """
     scaled = face / 255.0
     resized = cv2.resize(scaled, (256, 256))
     return torch.Tensor(resized).unsqueeze(0)
 def preprocess_audio(audio_file):
     """Wrap raw audio samples in a tensor with a leading dimension of 1.

     ``audio_file`` is passed straight to ``torch.Tensor``, so it must be
     something that constructor accepts (e.g. a sequence or array of
     numbers).
     """
     return torch.Tensor(audio_file).unsqueeze(0)
 def preprocess_video(input_video, n_frames=3):
     v_cap = cv2.VideoCapture(input_video)
     v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    sample = np.linspace(0, v_len - 1, n_frames).astype(int)
     frames = []
     for j in range(v_len):
         success = v_cap.grab()
     v_cap.release()
     return frames
+# Prediction functions
 def deepfakes_spec_predict(input_audio):
     """Run the audio model on a clip and return a REAL/FAKE verdict string.

     ``input_audio`` is unpacked as a two-item pair; only the first item
     is used, the second is discarded.
     """
     signal, _ = input_audio
     batch = preprocess_audio(signal)
     scores = spec_model(batch).detach().cpu().numpy()
     scores = np.squeeze(scores)
     # NOTE(review): the 0.5 cut-off on the first score assumes spec_model
     # emits probabilities rather than logits/log-probabilities -- confirm
     # against models.image.RawNet.
     return "The audio is REAL." if scores[0] > 0.5 else "The audio is FAKE."
 def deepfakes_image_predict(input_image):
     """Run the image model on one image and return a verdict with a score.

     The first model output is compared with 0.5: above it the image is
     reported as REAL using that output as the score; otherwise it is
     reported as FAKE using the second output as the score. Scores are
     shown as percentages rounded to three decimals.
     """
     batch = preprocess_img(input_image)
     scores = np.squeeze(img_model(batch).detach().cpu().numpy())
     is_real = scores[0] > 0.5
     label = "REAL" if is_real else "FAKE"
     preds = round((scores[0] if is_real else scores[1]) * 100, 3)
     return f"The image is {label}. \nConfidence score: {preds}%"
 def deepfakes_video_predict(input_video):
     """Classify a video as real or fake from its sampled frames.

     Every frame returned by ``preprocess_video`` is passed through
     ``img_model``; the first and second model outputs are averaged
     separately across frames. If the mean of the first output exceeds
     0.5 the video is reported as REAL with that mean as the score,
     otherwise as FAKE with the mean of the second output as the score.

     Returns a human-readable string. When no frames were extracted, an
     explanatory message is returned instead of a verdict.
     """
     video_frames = preprocess_video(input_video)
     if len(video_frames) == 0:
         # With nothing to average, np.mean([]) is nan; "nan > 0.5" is
         # False, so the video would be labelled FAKE with a "nan%" score.
         return "No frames could be read from the video."
     real_faces_list, fake_faces_list = [], []
     for face in video_frames:
         img_grads = img_model(face)
         img_grads_np = np.squeeze(img_grads.detach().cpu().numpy())
         real_faces_list.append(img_grads_np[0])
         fake_faces_list.append(img_grads_np[1])
     real_faces_mean = np.mean(real_faces_list)
     fake_faces_mean = np.mean(fake_faces_list)
     if real_faces_mean > 0.5:
         preds = round(real_faces_mean * 100, 3)
         text2 = f"The video is REAL. \nConfidence score: {preds}%"
     else:
         preds = round(fake_faces_mean * 100, 3)
         text2 = f"The video is FAKE. \nConfidence score: {preds}%"
     return text2