import io
import torch
from torch.utils.model_zoo import load_url
from PIL import Image
from scipy.special import expit
import matplotlib.pyplot as plt

import sys
sys.path.append('./icpr2020dfdc/')  # local clone of https://github.com/polimi-ispl/icpr2020dfdc

from blazeface import FaceExtractor, BlazeFace, VideoReader
from architectures import fornet, weights
from isplutils import utils

import gradio as gr



"""
Choose an architecture between
- EfficientNetB4
- EfficientNetB4ST
- EfficientNetAutoAttB4
- EfficientNetAutoAttB4ST
- Xception
"""
net_model = 'EfficientNetAutoAttB4'

"""
Choose a training dataset between
- DFDC
- FFPP
"""
train_db = 'DFDC'
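# DFDC = Deepfake Detection Challenge dataset; FFPP = FaceForensics++.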

device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
face_policy = 'scale'    # how detected faces are cropped/resized before inference
face_size = 224          # side of the square face crop fed to the network
frames_per_video = 32    # number of frames sampled from each input video

# Download the pretrained weights for the chosen (architecture, training set) pair
model_url = weights.weight_url['{:s}_{:s}'.format(net_model, train_db)]
net = getattr(fornet, net_model)().eval().to(device)
net.load_state_dict(load_url(model_url, map_location=device, check_hash=True))

# Test-time transform matching the preprocessing used during training
transf = utils.get_transformer(face_policy, face_size, net.get_normalizer(), train=False)
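
# Minimal single-image sanity check of the preprocessing + network (commented out;
# './sample_face.jpg' is an illustrative path, not shipped with this app):
#
#   import numpy as np
#   face = np.array(Image.open('./sample_face.jpg').convert('RGB'))
#   x = transf(image=face)['image'].unsqueeze(0).to(device)  # shape (1, 3, 224, 224)
#   with torch.no_grad():
#       print('fakeness score:', expit(net(x).cpu().item()))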

# BlazeFace detector + video reader: sample frames_per_video frames per clip
facedet = BlazeFace().to(device)
facedet.load_weights("./icpr2020dfdc/blazeface/blazeface.pth")
facedet.load_anchors("./icpr2020dfdc/blazeface/anchors.npy")
videoreader = VideoReader(verbose=False)
video_read_fn = lambda x: videoreader.read_frames(x, num_frames=frames_per_video)
face_extractor = FaceExtractor(video_read_fn=video_read_fn, facedet=facedet)
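# FaceExtractor.process_video returns one dict per sampled frame, including
# 'frame_idx' and 'faces' (RGB crops as numpy arrays), which inference() uses below.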

title = "Face Manipulation Detection Through Ensemble of CNNs"

def inference(vid):
    # Detect and crop faces from the sampled frames of the input video
    vid_real_faces = face_extractor.process_video(vid)
    frames_with_faces = [f for f in vid_real_faces if len(f['faces'])]
    if not frames_with_faces:
        return None, None, "No face detected"

    # Keep the first detected face of each frame and apply the test-time transform
    faces_real_t = torch.stack([transf(image=f['faces'][0])['image'] for f in frames_with_faces])
    with torch.no_grad():
        faces_real_pred = net(faces_real_t.to(device)).cpu().numpy().flatten()

    # Per-frame fakeness score: sigmoid (expit) of the raw logit. Use an explicit
    # figure so repeated calls don't draw onto the same axes; the old
    # use_line_collection flag is the default (and later removed) in recent Matplotlib.
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.stem([f['frame_idx'] for f in frames_with_faces], expit(faces_real_pred))
    ax.set_title('Score per Frame')
    ax.set_xlabel('Frame')
    ax.set_ylabel('Score')
    ax.set_ylim([0, 1])
    ax.grid()

    img_buf = io.BytesIO()
    fig.savefig(img_buf, format='png')
    plt.close(fig)
    img_buf.seek(0)
    im = Image.open(img_buf)

    # Video-level score: sigmoid of the mean logit over all frames with a face
    res = expit(faces_real_pred.mean())
    if res >= 0.5:
        return "./Labels/Fake.jpg", im, f"{res*100:.2f}%"
    else:
        return "./Labels/Real.jpg", im, f"{res*100:.2f}%"

# Current Gradio components (the gr.inputs/gr.outputs namespaces were removed in Gradio 4)
demo = gr.Interface(
    fn=inference,
    inputs=gr.Video(label="In"),
    outputs=[
        gr.Image(type="pil", label="Label"),
        gr.Image(type="pil", label="Score per Frame"),
        gr.Label(label="Score"),
    ],
    title=title,
)
demo.launch(debug=True)
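
# Rough setup sketch (the dependency list is an assumption, not pinned by this file):
#   git clone https://github.com/polimi-ispl/icpr2020dfdc
#   pip install torch gradio matplotlib scipy pillow albumentations efficientnet-pytorch
#   python app.py   # Gradio prints a local URL to open in the browser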