import re
import glob
import pickle
import os

import gradio as gr
import numpy as np
import torch

from utils.audio import load_spectrograms
from utils.compute_args import compute_args
from utils.tokenize import tokenize, create_dict, sent_to_ix, cmumosei_2, cmumosei_7, pad_feature
from model_LA import Model_LA

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

working_dir = "."

# Load model checkpoints
ckpts_path = os.path.join(working_dir, 'ckpt')
model_name = "Model_LA_e"
# List saved 'best*' checkpoints in reverse order; ckpts[0] is loaded below
ckpts = sorted(glob.glob(os.path.join(ckpts_path, model_name, 'best*')), reverse=True)

# Load the original training args, vocabulary, embeddings and weights
ckpt = torch.load(ckpts[0], map_location=device)
args = compute_args(ckpt['args'])
pretrained_emb = np.load("train_glove.npy")
token_to_ix = pickle.load(open("token_to_ix.pkl", "rb"))
state_dict = ckpt['state_dict']

net = Model_LA(args, len(token_to_ix), pretrained_emb).to(device)
net.load_state_dict(state_dict)

def inference(video_path, text):
    # data preprocessing
    # text
    def clean(w):
        return re.sub(
                r"([.,'!?\"()*#:;])",
                '',
                w.lower()
                ).replace('-', ' ').replace('/', ' ')

    s = [clean(w) for w in text.split() if clean(w) != '']
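    # e.g. "It was, really good!" -> ['it', 'was', 'really', 'good']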

    # Sound
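    # load_spectrograms is assumed to return (filename, mel, mag);
    # the first value is discarded and only the spectrograms are kept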
    _, mel, mag = load_spectrograms(video_path)

    l_max_len = args.lang_seq_len
    a_max_len = args.audio_seq_len
    v_max_len = args.video_seq_len
    L = sent_to_ix(s, token_to_ix, max_token=l_max_len)
    A = pad_feature(mel, a_max_len)
    # Note: the video branch is also fed the (padded) mel spectrogram here
    V = pad_feature(mel, v_max_len)
    # Report how each modality was padded/truncated
    print(f"Processed text: {len(s)} tokens -> shape {L.shape}")
    print(f"Processed audio: shape {mel.shape} -> {A.shape}")
    print(f"Processed video: shape {mel.shape} -> {V.shape}")

    # Run the model in evaluation mode, without tracking gradients
    net.eval()
    x = np.expand_dims(L, axis=0)
    y = np.expand_dims(A, axis=0)
    z = np.expand_dims(V, axis=0)
    x = torch.from_numpy(x).to(device)
    y = torch.from_numpy(y).to(device)
    z = torch.from_numpy(z).float().to(device)
    with torch.no_grad():
        pred = net(x, y, z).cpu().numpy()
    labels = ['happy', 'sad', 'angry', 'fear', 'disgust', 'surprise']
    # Return a label -> score dict, as expected by the Gradio "label" output
    return {label: float(score) for label, score in zip(labels, pred[0])}
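
# Quick local sanity check (hypothetical; uncomment to try the pipeline
# outside the Gradio UI, using one of the bundled examples):
# scores = inference('examples/03bSnISJMiM_1.mp4', "IT WAS REALLY GOOD ")
# print(scores)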


title="Emotion Recognition"
description="This is a demo implementation of EfficientNetV2 Deepfakes Image Detector by using frame-by-frame detection. \
            To use it, simply upload your video, or click one of the examples to load them.\
            This demo and model represent the work of \"Achieving Face Swapped Deepfakes Detection Using EfficientNetV2\" by Lee Sheng Yeh. \
            The examples were extracted from Celeb-DF(V2)(Li et al, 2020) and FaceForensics++(Rossler et al., 2019). Full reference details is available in \"references.txt.\" \
            The examples are used under fair use to demo the working of the model only. If any copyright is infringed, please contact the researcher via this email: tp054565@mail.apu.edu.my, the researcher will immediately take down the examples used.\
            "
            
examples = [
    ['examples/03bSnISJMiM_1.mp4', "IT WAS REALLY GOOD "],
    ['examples/03bSnISJMiM_5.mp4', "AND THEY SHOULDVE I GUESS "],
]
           
gr.Interface(
    inference,
    inputs=["video", "text"],
    outputs=["label"],
    title=title,
    description=description,
    examples=examples,
).launch(debug=True)