# Spaces:
# Build error
# Build error
import glob
import os
import pickle
import re

import gradio as gr
import numpy as np
import torch

from model_LA import Model_LA
from utils.audio import load_spectrograms
from utils.compute_args import compute_args
from utils.tokenize import tokenize, create_dict, sent_to_ix, cmumosei_2, cmumosei_7, pad_feature
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
working_dir = "."

# load model
ckpts_path = os.path.join(working_dir, 'ckpt')
model_name = "Model_LA_e"

# Listing sorted checkpoints: every 'best*' file under ckpt/<model_name>/,
# in reverse lexicographic order. ckpts[0] is the one that gets loaded.
ckpts = sorted(glob.glob(os.path.join(ckpts_path, model_name, 'best*')), reverse=True)
if not ckpts:
    # Fail with a message that names the directory instead of a bare
    # IndexError on ckpts[0].
    raise FileNotFoundError(
        f"No 'best*' checkpoint found in {os.path.join(ckpts_path, model_name)!r}"
    )

# Read the checkpoint from disk once; both the training args and the weights
# come out of the same file.
# NOTE: torch.load and pickle.load unpickle arbitrary objects -- only point
# them at files you trust.
checkpoint = torch.load(ckpts[0], map_location=device)

# Load original args
args = compute_args(checkpoint['args'])

pretrained_emb = np.load("train_glove.npy")
with open("token_to_ix.pkl", "rb") as vocab_file:
    token_to_ix = pickle.load(vocab_file)

state_dict = checkpoint['state_dict']
net = Model_LA(args, len(token_to_ix), pretrained_emb).to(device)
net.load_state_dict(state_dict)
def inference(video_path, text):
    """Score one clip and its transcript for six emotions.

    Args:
        video_path: Path of the media file; it is handed to
            ``load_spectrograms`` to obtain the mel spectrogram.
        text: What is said in the clip. It is split on whitespace, and each
            word is lower-cased and stripped of punctuation before being
            mapped to token ids. ``None`` is treated as an empty string.

    Returns:
        dict mapping each of 'happy', 'sad', 'angry', 'fear', 'disgust' and
        'surprise' to the model output at that position, as a Python float.
    """
    # data preprocessing
    # text
    def clean(w):
        # Lower-case, drop punctuation, then turn '-' and '/' into spaces.
        return re.sub(
            r"([.,'!?\"()*#:;])",
            '',
            w.lower()
        ).replace('-', ' ').replace('/', ' ')

    # Clean every word exactly once; words that clean down to '' are dropped.
    cleaned = (clean(w) for w in (text or "").split())
    s = [w for w in cleaned if w != '']

    # Sound (only the mel spectrogram is used below)
    _, mel, _ = load_spectrograms(video_path)

    l_max_len = args.lang_seq_len
    a_max_len = args.audio_seq_len
    v_max_len = args.video_seq_len

    L = sent_to_ix(s, token_to_ix, max_token=l_max_len)
    A = pad_feature(mel, a_max_len)
    # NOTE(review): the "video" input is padded from the mel spectrogram as
    # well; no visual features are extracted here -- confirm this is intended.
    V = pad_feature(mel, v_max_len)

    # print shapes
    print(f"Processed text shape from {len(s)} to {L.shape}")
    print(f"Processed audio shape from {mel.shape} to {A.shape}")
    print(f"Processed video shape from {mel.shape} to {V.shape}")

    net.train(False)

    # Add a leading batch axis of size 1 to each input and move it to `device`.
    x = torch.from_numpy(np.expand_dims(L, axis=0)).to(device)
    y = torch.from_numpy(np.expand_dims(A, axis=0)).to(device)
    z = torch.from_numpy(np.expand_dims(V, axis=0)).float().to(device)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        pred = net(x, y, z).detach().cpu().numpy()

    label_to_ix = ['happy', 'sad', 'angry', 'fear', 'disgust', 'surprise']
    # Plain Python floats so the result serialises cleanly for the UI layer.
    return {label: float(score) for label, score in zip(label_to_ix, pred[0])}
# Text shown at the top of the demo page.
title = "Emotion Recognition"

# The previous description was copied from an unrelated deepfake-detection
# demo (different model, datasets and author). It is replaced here with text
# that matches what this script does: video + transcript in, six emotion
# scores out.
# NOTE(review): add the proper model/dataset attribution for this demo.
description = (
    "Demo of a multimodal emotion recognition model. "
    "Upload a short video clip and type what is said in it; the model scores "
    "the clip for six emotions: happy, sad, angry, fear, disgust and surprise. "
    "To try it quickly, click one of the examples to load it."
)
# Sample inputs offered in the UI: one [video file, transcript] pair per row.
examples = [
    ['examples/03bSnISJMiM_1.mp4', "IT WAS REALLY GOOD "],
    ['examples/03bSnISJMiM_5.mp4', "AND THEY SHOULDVE I GUESS "],
]

# Wire `inference` to a video + text input form with a label output, then
# start the app with debug output enabled.
demo = gr.Interface(
    fn=inference,
    inputs=["video", "text"],
    outputs=["label"],
    title=title,
    description=description,
    examples=examples,
)
demo.launch(debug=True)