ybbwcwaps
committed on
Commit 711b041
Parent(s): 3cc4a06
some FakeVD
- FakeVD/code_test/C3D_Feature_Extractor/C3D_model.py +74 -0
- FakeVD/code_test/C3D_Feature_Extractor/error.txt +0 -0
- FakeVD/code_test/C3D_Feature_Extractor/feature_extractor_frm.py +119 -0
- FakeVD/code_test/C3D_Feature_Extractor/feature_extractor_vid.py +209 -0
- FakeVD/code_test/C3D_Feature_Extractor/output_frm/douyin_6571001202379590925.hdf5 +0 -0
- FakeVD/code_test/C3D_Feature_Extractor/raw_video/douyin_6571001202379590925.mp4 +0 -0
- FakeVD/code_test/C3D_Feature_Extractor/raw_video/douyin_6583481991964921092.mp4 +0 -0
- FakeVD/code_test/main.py +67 -0
- FakeVD/code_test/models/Baselines.py +160 -0
- FakeVD/code_test/models/FANVM.py +133 -0
- FakeVD/code_test/models/SVFEND.py +110 -0
- FakeVD/code_test/models/TikTec.py +140 -0
- FakeVD/code_test/models/Trainer.py +235 -0
- FakeVD/code_test/models/Trainer_3set.py +241 -0
- FakeVD/code_test/models/coattention.py +122 -0
- FakeVD/code_test/models/layers.py +54 -0
- FakeVD/code_test/models/trm.py +80 -0
- FakeVD/code_test/predict.py +162 -0
- FakeVD/code_test/run.py +500 -0
- FakeVD/code_test/test.py +13 -0
FakeVD/code_test/C3D_Feature_Extractor/C3D_model.py
ADDED
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+import torch.nn as nn
+
+
+class C3D(nn.Module):
+    """
+    nb_classes: number of output classes for the classification task (e.g. 101 for the UCF101 dataset).
+    """
+
+    def __init__(self, nb_classes):
+        super(C3D, self).__init__()
+
+        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
+        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
+
+        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
+        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
+
+        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
+        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
+        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
+
+        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
+        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
+        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
+
+        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
+        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
+        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))
+
+        self.fc6 = nn.Linear(8192, 4096)
+        self.fc7 = nn.Linear(4096, 4096)
+        self.fc8 = nn.Linear(4096, nb_classes)
+
+        self.dropout = nn.Dropout(p=0.5)
+
+        self.relu = nn.ReLU()
+
+    def forward(self, x, feature_layer):
+        h = self.relu(self.conv1(x))
+        h = self.pool1(h)
+        h = self.relu(self.conv2(h))
+        h = self.pool2(h)
+
+        h = self.relu(self.conv3a(h))
+        h = self.relu(self.conv3b(h))
+        h = self.pool3(h)
+
+        h = self.relu(self.conv4a(h))
+        h = self.relu(self.conv4b(h))
+        h = self.pool4(h)
+
+        h = self.relu(self.conv5a(h))
+        h = self.relu(self.conv5b(h))
+        h = self.pool5(h)
+
+        h = h.reshape(-1, 8192)
+        out = h if feature_layer == 5 else None
+        h = self.relu(self.fc6(h))
+        out = h if feature_layer == 6 and out is None else out
+        h = self.dropout(h)
+        h = self.relu(self.fc7(h))
+        out = h if feature_layer == 7 and out is None else out
+        h = self.dropout(h)
+        logits = self.fc8(h)
+        return logits, out
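A minimal usage sketch for the extractor interface above (the 487-class head matches the Sports-1M pretrained weights loaded elsewhere in this commit; the random input and the import path are illustrative assumptions):

import torch
from C3D_model import C3D  # assumes the extractor directory is on the path

# A C3D clip is 16 RGB frames at 112x112: (batch, channels, frames, H, W).
net = C3D(nb_classes=487).eval()
clip = torch.randn(1, 3, 16, 112, 112)
with torch.no_grad():
    logits, feature = net(clip, feature_layer=6)
print(logits.shape)   # torch.Size([1, 487])
print(feature.shape)  # torch.Size([1, 4096]); feature_layer=5 would give 8192-d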
FakeVD/code_test/C3D_Feature_Extractor/error.txt
ADDED
File without changes
FakeVD/code_test/C3D_Feature_Extractor/feature_extractor_frm.py
ADDED
@@ -0,0 +1,119 @@
+# coding: utf-8
+# from data_provider import *
+from C3D_model import *
+import json
+import torchvision
+import torch.optim as optim
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+import argparse
+import os
+from torch import save, load
+import pickle
+import time
+import numpy as np
+import PIL.Image as Image
+import collections
+#import imageio  # read video
+import skimage.io as io
+from skimage.transform import resize
+import h5py
+import fnmatch
+from PIL import Image
+
+def feature_extractor():
+    #trainloader = Train_Data_Loader(VIDEO_DIR, resize_w=128, resize_h=171, crop_w=112, crop_h=112, nb_frames=16)
+    net = C3D(487)
+    print('net', net)
+    ## Load the model pretrained on Sports-1M and fine-tune the last layer
+    net.load_state_dict(torch.load('/data1/miayuan/pretrained_models/c3d.pickle'))
+    if RUN_GPU:
+        net.cuda(0)
+    net.eval()
+    print('net', net)
+    feature_dim = 4096 if EXTRACTED_LAYER != 5 else 8192
+    video_list = os.listdir(VIDEO_DIR)
+    print('video_list', video_list)
+    if not os.path.isdir(OUTPUT_DIR):
+        os.mkdir(OUTPUT_DIR)
+    f = h5py.File(os.path.join(OUTPUT_DIR, OUTPUT_NAME), 'w')
+
+    def count_files(directory, prefix_list):
+        lst = os.listdir(directory)
+        cnt_list = [len(fnmatch.filter(lst, x + '*')) for x in prefix_list]
+        return cnt_list
+
+    for video_name in video_list:
+        video_path = os.path.join(VIDEO_DIR, video_name)
+        print('video_path', video_path)
+        #video = imageio.get_reader(video_path, 'ffmpeg')
+        all_cnt = count_files(video_path, ('image_',))  # one-element tuple, not a bare string
+        total_frames = all_cnt[0]
+        print('Total frames: %d' % total_frames)
+        valid_frames = total_frames // nb_frames * nb_frames
+        print('Total validated frames: %d' % valid_frames)
+        index_w = np.random.randint(resize_w - crop_w)  ## crop
+        index_h = np.random.randint(resize_h - crop_h)  ## crop
+        features = []
+        print('NB features: %d' % (valid_frames // nb_frames))
+        for i in range(valid_frames // nb_frames):
+            clip = np.array([resize(io.imread(os.path.join(video_path, 'image_{:04d}.jpg'.format(j))), output_shape=(resize_w, resize_h), preserve_range=True) for j in range(i * nb_frames + 1, (i + 1) * nb_frames + 1)])
+            clip = clip[:, index_w: index_w + crop_w, index_h: index_h + crop_h, :]
+            clip = torch.from_numpy(np.float32(clip.transpose(3, 0, 1, 2)))
+            clip = Variable(clip).cuda() if RUN_GPU else Variable(clip)
+            clip = clip.resize(1, 3, nb_frames, crop_w, crop_h)
+            _, clip_output = net(clip, EXTRACTED_LAYER)
+            clip_feature = (clip_output.data).cpu()
+            features.append(clip_feature)
+        features = torch.cat(features, 0)
+        features = features.numpy()
+        print('features', features)
+
+        fgroup = f.create_group(video_name)
+        fgroup.create_dataset('c3d_features', data=features)
+        fgroup.create_dataset('total_frames', data=np.array(total_frames))
+        fgroup.create_dataset('valid_frames', data=np.array(valid_frames))
+
+        #with open(os.path.join(OUTPUT_DIR, video_name[:-4]), 'wb') as f:
+        #    pickle.dump(features, f)
+        print('%s has been processed...' % video_name)
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    print('******--------- Extract C3D features ------*******')
+    parser.add_argument('-o', '--OUTPUT_DIR', dest='OUTPUT_DIR', type=str, default='./output_frm/', help='Output directory')
+    parser.add_argument('-l', '--EXTRACTED_LAYER', dest='EXTRACTED_LAYER', type=int, choices=[5, 6, 7], default=5, help='Feature extractor layer')
+    parser.add_argument('-i', '--VIDEO_DIR', dest='VIDEO_DIR', type=str, help='Input video directory')
+    parser.add_argument('-gpu', '--gpu', dest='GPU', action='store_true', help='Run on GPU?')
+    parser.add_argument('--OUTPUT_NAME', default='c3d_features.hdf5', help='Name of the output HDF5 feature file')
+
+    args = parser.parse_args()
+    params = vars(args)  # convert to an ordinary dict
+    print('parsed parameters:')
+    print(json.dumps(params, indent=2))
+
+    OUTPUT_DIR = params['OUTPUT_DIR']
+    EXTRACTED_LAYER = params['EXTRACTED_LAYER']
+    VIDEO_DIR = params['VIDEO_DIR']
+    RUN_GPU = params['GPU']
+    OUTPUT_NAME = params['OUTPUT_NAME']
+    crop_w = 112
+    resize_w = 128
+    crop_h = 112
+    resize_h = 171
+    nb_frames = 16
+    feature_extractor()
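A hedged sketch of consuming the resulting file (group and dataset names follow the create_group/create_dataset calls above; the output path assumes the script's defaults):

import h5py

# Each video becomes one HDF5 group holding its stacked per-clip C3D features.
with h5py.File('./output_frm/c3d_features.hdf5', 'r') as f:
    for video_name in f:                       # one group per processed video
        feats = f[video_name]['c3d_features'][...]
        print(video_name, feats.shape)         # (n_clips, 8192) for layer 5, else (n_clips, 4096)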
FakeVD/code_test/C3D_Feature_Extractor/feature_extractor_vid.py
ADDED
@@ -0,0 +1,209 @@
+# coding: utf-8
+# from data_provider import *
+from .C3D_model import *
+import torchvision
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+import argparse
+import os
+from torch import save, load
+import pickle
+import time
+import numpy as np
+import PIL.Image as Image
+import skimage.io as io
+from skimage.transform import resize
+import h5py
+from PIL import Image
+
+def load_model_c3d(RUN_GPU=False):
+    net = C3D(487)
+    ## Load the model pretrained on Sports-1M and fine-tune the last layer
+    net.load_state_dict(torch.load('./FakeVD/code_test/C3D_Feature_Extractor/c3d.pickle'))
+    if RUN_GPU:
+        net.cuda(0)
+    net.eval()
+    return net
+
+
+def feature_extractor(net, OUTPUT_DIR, VIDEO_DIR, video_path=None):
+    crop_w = 112
+    resize_w = 128
+    crop_h = 112
+    resize_h = 171
+    nb_frames = 16
+    BATCH_SIZE = 10
+    EXTRACTED_LAYER = 6
+    RUN_GPU = False
+
+    feature_dim = 4096 if EXTRACTED_LAYER != 5 else 8192
+
+    # read the video list from the folder, or process a single file
+    if video_path:
+        video_list = [video_path]
+    else:
+        video_list = [f for f in os.listdir(VIDEO_DIR) if os.path.isfile(os.path.join(VIDEO_DIR, f))]
+
+    if not os.path.isdir(OUTPUT_DIR):
+        os.mkdir(OUTPUT_DIR)
+
+    # temporary frame folder under the current working directory
+    temp_path = os.path.join(os.getcwd(), 'temp')
+    if not os.path.exists(temp_path):
+        os.mkdir(temp_path)
+
+    error_fid = open('error.txt', 'w')
+    for video_name in video_list:
+        output_file_name = video_name.split('.')[0] + '.hdf5'
+        print(output_file_name)
+        f = h5py.File(os.path.join(OUTPUT_DIR, output_file_name), 'w')
+
+        video_path = os.path.join(VIDEO_DIR, video_name)
+        print('video_path', video_path)
+        frame_path = os.path.join(temp_path, video_name)
+        if not os.path.exists(frame_path):
+            os.mkdir(frame_path)
+
+        print('Extracting video frames ...')
+        # use ffmpeg to extract video frames into the temporary folder
+        # example: ffmpeg -i video_validation_0000051.mp4 -q:v 2 -f image2 output/image%5d.jpg
+        os.system('ffmpeg -i ' + video_path + ' -q:v 2 -f image2 ' + frame_path + '/image_%5d.jpg')
+
+        print('Extracting features ...')
+        total_frames = len(os.listdir(frame_path))
+        if total_frames == 0:
+            error_fid.write(video_name + '\n')
+            print('Failed to extract frames for video: %s' % video_name)
+            continue
+
+        valid_frames = total_frames // nb_frames * nb_frames
+        n_feat = valid_frames // nb_frames  # number of extractable features; each one spans nb_frames frames
+        n_batch = n_feat // BATCH_SIZE
+        if n_feat - n_batch * BATCH_SIZE > 0:
+            n_batch = n_batch + 1
+        print('n_frames: %d; n_feat: %d; n_batch: %d' % (total_frames, n_feat, n_batch))
+
+        # random crop
+        index_w = np.random.randint(resize_w - crop_w)
+        index_h = np.random.randint(resize_h - crop_h)
+
+        features = []
+
+        for i in range(n_batch - 1):
+            input_blobs = []
+            for j in range(BATCH_SIZE):
+                clip = np.array([resize(io.imread(os.path.join(frame_path, 'image_{:05d}.jpg'.format(k))), output_shape=(resize_w, resize_h), preserve_range=True) for k in range((i * BATCH_SIZE + j) * nb_frames + 1, min((i * BATCH_SIZE + j + 1) * nb_frames + 1, valid_frames + 1))])
+                clip = clip[:, index_w: index_w + crop_w, index_h: index_h + crop_h, :]
+                input_blobs.append(clip)
+            input_blobs = np.array(input_blobs, dtype='float32')
+            input_blobs = torch.from_numpy(np.float32(input_blobs.transpose(0, 4, 1, 2, 3)))
+            input_blobs = Variable(input_blobs).cuda() if RUN_GPU else Variable(input_blobs)
+            _, batch_output = net(input_blobs, EXTRACTED_LAYER)  # forward pass, keeping the chosen layer
+            batch_feature = (batch_output.data).cpu()
+            features.append(batch_feature)
+
+        # the last (possibly partial) batch
+        input_blobs = []
+        for j in range(n_feat - (n_batch - 1) * BATCH_SIZE):
+            clip = np.array([resize(io.imread(os.path.join(frame_path, 'image_{:05d}.jpg'.format(k))), output_shape=(resize_w, resize_h), preserve_range=True) for k in range(((n_batch - 1) * BATCH_SIZE + j) * nb_frames + 1, min(((n_batch - 1) * BATCH_SIZE + j + 1) * nb_frames + 1, valid_frames + 1))])
+            clip = clip[:, index_w: index_w + crop_w, index_h: index_h + crop_h, :]
+            input_blobs.append(clip)
+        input_blobs = np.array(input_blobs, dtype='float32')
+        input_blobs = torch.from_numpy(np.float32(input_blobs.transpose(0, 4, 1, 2, 3)))
+        input_blobs = Variable(input_blobs).cuda() if RUN_GPU else Variable(input_blobs)
+        _, batch_output = net(input_blobs, EXTRACTED_LAYER)
+        batch_feature = (batch_output.data).cpu()
+        features.append(batch_feature)
+
+        features = torch.cat(features, 0)
+        features = features.numpy()
+        print('features', features.shape)
+        fgroup = f.create_group(video_name.split('.')[0])
+        fgroup.create_dataset('c3d_features', data=features)
+
+        print('%s has been processed...' % video_name)
+
+        # clear the temporary frame folder
+        try:
+            os.system('rm -rf ' + frame_path)
+        except:
+            pass
+
+
+if __name__ == "__main__":
+
+    OUTPUT_DIR = './FakeVD/code/C3D_Feature_Extractor/output_frm'
+    VIDEO_DIR = './FakeVD/code/C3D_Feature_Extractor/raw_video'
+    VIDEO_PATH = 'douyin_6571001202379590925.mp4'
+
+    net = load_model_c3d()
+    feature_extractor(net, OUTPUT_DIR, VIDEO_DIR, video_path=VIDEO_PATH)
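A minimal sketch of driving this extractor from another module, mirroring the __main__ block above (the import path is an assumption that depends on where the package is rooted; paths are the commit's own placeholders):

from FakeVD.code_test.C3D_Feature_Extractor.feature_extractor_vid import (
    load_model_c3d, feature_extractor)

# Writes one <video_id>.hdf5 per video, each holding a 'c3d_features' dataset
# of shape (n_clips, 4096) with the default EXTRACTED_LAYER = 6.
net = load_model_c3d(RUN_GPU=False)
feature_extractor(net,
                  OUTPUT_DIR='./FakeVD/code/C3D_Feature_Extractor/output_frm',
                  VIDEO_DIR='./FakeVD/code/C3D_Feature_Extractor/raw_video',
                  video_path='douyin_6571001202379590925.mp4')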
FakeVD/code_test/C3D_Feature_Extractor/output_frm/douyin_6571001202379590925.hdf5
ADDED
Binary file (248 kB).
FakeVD/code_test/C3D_Feature_Extractor/raw_video/douyin_6571001202379590925.mp4
ADDED
Binary file (820 kB).
FakeVD/code_test/C3D_Feature_Extractor/raw_video/douyin_6583481991964921092.mp4
ADDED
Binary file (386 kB).
FakeVD/code_test/main.py
ADDED
@@ -0,0 +1,67 @@
+import argparse
+import os
+import random
+import warnings
+warnings.filterwarnings('ignore')
+import numpy as np
+import torch
+from run import Run
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--model_name', default='SVFEND', help='SVFEND/FANVM/C3D/VGG/Bbox/Vggish/Bert/TextCNN/Comments/TikTec')
+parser.add_argument('--mode_eval', default='nocv', help='nocv/cv/temporal')
+parser.add_argument('--fold', type=int, default=1, help='needed when mode_eval=nocv')
+
+parser.add_argument('--epoches', type=int, default=30)
+parser.add_argument('--batch_size', type=int, default=128)
+parser.add_argument('--num_workers', type=int, default=0)
+parser.add_argument('--epoch_stop', type=int, default=5)
+parser.add_argument('--seed', type=int, default=2022)
+parser.add_argument('--gpu', type=int, required=True)
+parser.add_argument('--lr', type=float, default=0.0001)
+parser.add_argument('--lambd', type=float, default=0.1)
+parser.add_argument('--dropout', type=float, default=0.1)
+parser.add_argument('--weight_decay', type=float, default=5e-5)
+
+parser.add_argument('--path_param', default='./checkpoints/')
+parser.add_argument('--path_tensorboard', default='./tb/')
+
+args = parser.parse_args()
+
+# os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
+
+# fix all random number generators for reproducibility
+seed = args.seed
+random.seed(seed)
+np.random.seed(seed)
+torch.manual_seed(seed)
+torch.cuda.manual_seed(seed)
+torch.backends.cudnn.benchmark = False
+torch.backends.cudnn.deterministic = True
+
+print(args)
+
+config = {
+    'model_name': args.model_name,
+    'mode_eval': args.mode_eval,
+    'fold': args.fold,
+
+    'epoches': args.epoches,
+    'batch_size': args.batch_size,
+    'num_workers': args.num_workers,
+    'epoch_stop': args.epoch_stop,
+    'seed': args.seed,
+    'device': args.gpu,
+    'lr': args.lr,
+    'lambd': args.lambd,
+    'dropout': args.dropout,
+    'weight_decay': args.weight_decay,
+
+    'path_param': args.path_param,
+    'path_tensorboard': args.path_tensorboard,
+}
+
+
+if __name__ == '__main__':
+    Run(config=config).main()
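As a hedged launch example (the GPU id and flag values are purely illustrative):

# Equivalent to: python FakeVD/code_test/main.py --model_name SVFEND --mode_eval nocv --gpu 0
import subprocess

subprocess.run(
    ['python', 'FakeVD/code_test/main.py',
     '--model_name', 'SVFEND', '--mode_eval', 'nocv', '--gpu', '0'],
    check=True)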
FakeVD/code_test/models/Baselines.py
ADDED
@@ -0,0 +1,160 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers import BertModel
+from .layers import Attention
+
+
+class bBbox(torch.nn.Module):
+    def __init__(self, fea_dim):
+        super(bBbox, self).__init__()
+        self.img_dim = 4096
+        self.attention1 = Attention(dim=128, heads=4)
+        self.attention2 = Attention(dim=128, heads=4)
+
+        self.linear_img = nn.Sequential(torch.nn.Linear(self.img_dim, fea_dim), torch.nn.ReLU())
+
+        self.classifier = nn.Linear(fea_dim, 2)
+
+    def forward(self, **kwargs):
+        frames = kwargs['bbox_vgg']
+        fea_img = self.linear_img(frames)
+        fea_img = torch.reshape(fea_img, (-1, 45, 128))   # attend over the 45 objects within a frame
+        fea_img = self.attention1(fea_img)
+        fea_img = torch.mean(fea_img, -2)
+        fea_img = torch.reshape(fea_img, (-1, 83, 128))   # then attend over the 83 frames of a video
+        fea_img = self.attention2(fea_img)
+        fea_img = torch.mean(fea_img, -2)
+        output = self.classifier(fea_img)
+        return output, fea_img
+
+
+class bC3D(torch.nn.Module):
+    def __init__(self, fea_dim):
+        super(bC3D, self).__init__()
+        # self.video_dim = 4096
+        self.video_dim = 2048
+        self.attention = Attention(dim=128, heads=4)
+
+        self.linear_video = nn.Sequential(torch.nn.Linear(self.video_dim, fea_dim), torch.nn.ReLU())
+
+        self.classifier = nn.Linear(fea_dim, 2)
+
+    def forward(self, **kwargs):
+        c3d = kwargs['c3d']
+        fea_video = self.linear_video(c3d)
+        fea_video = self.attention(fea_video)
+        fea_video = torch.mean(fea_video, -2)
+        output = self.classifier(fea_video)
+        return output
+
+
+class bVGG(torch.nn.Module):
+    def __init__(self, fea_dim):
+        super(bVGG, self).__init__()
+        # self.img_dim = 4096
+        self.img_dim = 2048
+        self.attention = Attention(dim=128, heads=4)
+
+        self.linear_img = nn.Sequential(torch.nn.Linear(self.img_dim, fea_dim), torch.nn.ReLU())
+
+        self.classifier = nn.Linear(fea_dim, 2)
+
+    def forward(self, **kwargs):
+        frames = kwargs['frames']
+        fea_img = self.linear_img(frames)
+        fea_img = self.attention(fea_img)
+        fea_img = torch.mean(fea_img, -2)
+        output = self.classifier(fea_img)
+        return output
+
+
+class bVggish(torch.nn.Module):
+    def __init__(self, fea_dim):
+        super(bVggish, self).__init__()
+        # self.audio_dim = 128
+        self.attention = Attention(dim=128, heads=4)
+
+        self.vggish_layer = torch.hub.load('./torchvggish/', 'vggish', source='local')
+        net_structure = list(self.vggish_layer.children())
+        self.vggish_modified = nn.Sequential(*net_structure[-2:-1])
+
+        self.classifier = nn.Linear(fea_dim, 2)
+
+    def forward(self, **kwargs):
+        audioframes = kwargs['audioframes']
+        fea_audio = self.vggish_modified(audioframes)
+        fea_audio = self.attention(fea_audio)
+        fea_audio = torch.mean(fea_audio, -2)
+        output = self.classifier(fea_audio)
+        return output, fea_audio
+
+
+class bBert(torch.nn.Module):
+    def __init__(self, bert_model, fea_dim, dropout):
+        super(bBert, self).__init__()
+        self.text_dim = 768
+
+        self.bert = BertModel.from_pretrained(bert_model).requires_grad_(False)
+
+        self.linear_text = nn.Sequential(torch.nn.Linear(self.text_dim, fea_dim), torch.nn.ReLU())
+        self.classifier = nn.Linear(fea_dim, 2)
+
+    def forward(self, **kwargs):
+        title_inputid = kwargs['title_inputid']
+        title_mask = kwargs['title_mask']
+        fea_text = self.bert(title_inputid, attention_mask=title_mask)[1]  # pooled [CLS] output
+        fea_text = self.linear_text(fea_text)
+        output = self.classifier(fea_text)
+        return output, fea_text
+
+
+class bTextCNN(nn.Module):
+    def __init__(self, fea_dim, vocab_size):
+        super(bTextCNN, self).__init__()
+        self.vocab_size = vocab_size
+        self.fea_dim = fea_dim
+
+        self.channel_in = 1
+        self.filter_num = 14
+        self.window_size = [3, 4, 5]
+
+        self.textcnn = nn.ModuleList([nn.Conv2d(self.channel_in, self.filter_num, (K, self.vocab_size)) for K in self.window_size])
+        self.linear = nn.Sequential(torch.nn.Linear(len(self.window_size) * self.filter_num, self.fea_dim), torch.nn.ReLU())
+        self.classifier = nn.Linear(self.fea_dim, 2)
+
+    def forward(self, **kwargs):
+        title_w2v = kwargs['title_w2v']
+        text = title_w2v.unsqueeze(1)
+        text = [F.relu(conv(text)).squeeze(3) for conv in self.textcnn]
+        text = [F.max_pool1d(i.squeeze(2), i.shape[-1]).squeeze(2) for i in text]
+        fea_text = torch.cat(text, 1)
+        fea_text = self.linear(fea_text)
+
+        output = self.classifier(fea_text)
+
+        return output
+
+
+class bComments(torch.nn.Module):
+    def __init__(self, bert_model, fea_dim):
+        super(bComments, self).__init__()
+        self.comment_dim = 768
+        self.bert = BertModel.from_pretrained(bert_model).requires_grad_(False)
+        self.attention = Attention(dim=128, heads=4)
+        self.linear_comment = nn.Sequential(torch.nn.Linear(self.comment_dim, fea_dim), torch.nn.ReLU())
+        self.classifier = nn.Linear(fea_dim, 2)
+
+    def forward(self, **kwargs):
+        comments_inputid = kwargs['comments_inputid']
+        comments_mask = kwargs['comments_mask']
+        comments_feature = []
+        for i in range(comments_inputid.shape[0]):
+            bert_fea = self.bert(comments_inputid[i], attention_mask=comments_mask[i])[1]
+            comments_feature.append(bert_fea)
+        comments_feature = torch.stack(comments_feature)
+        fea_comments = self.linear_comment(comments_feature)
+        fea_comments = self.attention(fea_comments)
+        fea_comments = torch.mean(fea_comments, -2)
+        output = self.classifier(fea_comments)
+        return output
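A quick shape walk-through of bBbox's two-stage pooling above, as a dummy-tensor sketch (mean pooling stands in for the Attention layer from .layers, which is not part of this commit):

import torch

bs, n_frames, n_obj, fea_dim = 2, 83, 45, 128
# After linear_img: one fea_dim vector per detected object, flattened over frames.
fea = torch.randn(bs, n_frames * n_obj, fea_dim)

fea = fea.reshape(-1, n_obj, fea_dim)        # (bs*83, 45, 128): objects within a frame
fea = fea.mean(-2)                           # pool objects -> one vector per frame
fea = fea.reshape(-1, n_frames, fea_dim)     # (bs, 83, 128): frames within a video
fea = fea.mean(-2)                           # pool frames -> one vector per video
print(fea.shape)                             # torch.Size([2, 128])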
FakeVD/code_test/models/FANVM.py
ADDED
@@ -0,0 +1,133 @@
+import os
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import tqdm
+from sklearn.metrics import *
+from torch.autograd import Function
+from transformers import BertModel
+
+from .layers import *
+
+
+class TextCNN(nn.Module):
+    def __init__(self, fea_dim, vocab_size):
+        super(TextCNN, self).__init__()
+        self.vocab_size = vocab_size
+        self.fea_dim = fea_dim
+
+        self.channel_in = 1
+        self.filter_num = 14
+        self.window_size = [3, 4, 5]
+
+        self.textcnn = nn.ModuleList([nn.Conv2d(self.channel_in, self.filter_num, (K, self.vocab_size)) for K in self.window_size])
+        self.linear = nn.Sequential(torch.nn.Linear(len(self.window_size) * self.filter_num, self.fea_dim), torch.nn.ReLU())
+
+    def forward(self, inputs):
+        text = inputs.unsqueeze(1)
+        text = [F.relu(conv(text)).squeeze(3) for conv in self.textcnn]
+        text = [F.max_pool1d(i.squeeze(2), i.shape[-1]).squeeze(2) for i in text]
+        fea_text = torch.cat(text, 1)
+        fea_text = self.linear(fea_text)
+
+        return fea_text
+
+
+class VideoEncoder(nn.Module):
+    def __init__(self, emb_dim, fea_dim):
+        super(VideoEncoder, self).__init__()
+
+        self.emb_dim = emb_dim
+        self.linear1 = torch.nn.Linear(self.emb_dim, self.emb_dim, bias=False)
+        self.linear2 = nn.Sequential(torch.nn.Linear(self.emb_dim, fea_dim), torch.nn.ReLU())
+
+    def forward(self, input_thumb, input_L):
+        input_ALL = torch.cat((input_L, input_thumb), 1)  # (bs, len+1, 4096)
+        fea_A = torch.bmm(input_thumb, self.linear1(input_ALL).permute(0, 2, 1))  # (bs, 1, len+1)
+        fea_alpha = F.softmax(fea_A, dim=-1)  # (bs, 1, len+1)
+        fea_V = torch.matmul(fea_alpha, input_ALL).squeeze()  # (bs, 4096)
+        fea = self.linear2(fea_V)
+        return fea
+
+
+class ReverseLayerF(Function):
+    # gradient reversal layer: identity in the forward pass, negated gradient on the way back
+    @staticmethod
+    def forward(ctx, x):
+        return x.view_as(x)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output.neg()
+
+
+def grad_reverse(x):
+    return ReverseLayerF.apply(x)
+
+
+class FANVMModel(torch.nn.Module):
+    def __init__(self, bert_model, fea_dim):
+        super(FANVMModel, self).__init__()
+        self.text_dim = 768
+        self.img_dim = 4096
+        self.topic_dim = 15
+
+        self.bert = BertModel.from_pretrained(bert_model).requires_grad_(False)
+        self.title_encoder = TextCNN(fea_dim, self.text_dim)
+        self.comments_encoder = BiLSTM(self.text_dim, 300, fea_dim)  # used in forward() below
+        self.video_encoder = VideoEncoder(self.img_dim, fea_dim)
+
+        self.gate_m1 = torch.nn.Linear(fea_dim * 2, 1)
+        self.gate_m2 = torch.nn.Linear(fea_dim * 2, 1)
+
+        self.classifier = nn.Linear(fea_dim * 2, 2)
+        self.classifier_topic = nn.Linear(fea_dim * 3, self.topic_dim)
+
+    def forward(self, **kwargs):
+        title_inputid = kwargs['title_inputid']  # (batch, 512)
+        title_mask = kwargs['title_mask']  # (batch, 512)
+        fea_text = self.bert(title_inputid, attention_mask=title_mask)[0]  # (bs, seq, 768)
+        fea_text = self.title_encoder(fea_text)
+        fea_R = fea_text  # (bs, 128)
+
+        comments_inputid = kwargs['comments_inputid']  # (batch, 20, 250)
+        comments_mask = kwargs['comments_mask']  # (batch, 20, 250)
+        comments_like = kwargs['comments_like']
+        comments_feature = []
+        for i in range(comments_inputid.shape[0]):
+            bert_fea = self.bert(comments_inputid[i], attention_mask=comments_mask[i])[0]
+            comments_feature.append(self.comments_encoder(bert_fea))
+        comments_feature = torch.stack(comments_feature)  # (batch, seq, fea_dim)
+        fea_comments = []
+        for v in range(comments_like.shape[0]):  # loop over the batch
+            # reweight each comment by its (normalized) like count
+            comments_weight = torch.stack([torch.true_divide((i + 1), (comments_like[v].shape[0] + comments_like[v].sum())) for i in comments_like[v]])
+            comments_fea_reweight = torch.sum(comments_feature[v] * (comments_weight.reshape(comments_weight.shape[0], 1)), dim=0)
+            fea_comments.append(comments_fea_reweight)
+        fea_comments = torch.stack(fea_comments)
+        fea_H = fea_comments  # (bs, 600)
+
+        frames = kwargs['frames']  # (bs, 30, 4096)
+        frame_thumb = kwargs['frame_thmub']  # (bs, 1, 4096)
+        fea_video = self.video_encoder(frame_thumb, frames)
+        fea_V = fea_video  # (bs, 128)
+
+        s = kwargs['s']
+
+        ## fusion: title, frames
+        m1 = self.gate_m1(torch.cat((fea_V, fea_R), 1))
+        fea_P = torch.add(torch.mul(m1, fea_V), torch.mul((1 - m1), fea_R))
+        ## fusion: comments, title
+        m2 = s.reshape((s.shape[0], 1))
+        fea_E = torch.add(torch.mul(fea_H, m2), torch.mul(fea_R, (1 - m2)))
+
+        fea_fnd = torch.cat((fea_P, fea_E), 1).to(torch.float32)
+        output = self.classifier(fea_fnd)
+
+        fea_topic = torch.cat((fea_H, fea_R, fea_V), 1)
+        fea_reverse = grad_reverse(fea_topic)
+        output_topic = self.classifier_topic(fea_reverse)
+
+        return output, output_topic, fea_fnd
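A self-contained sketch of the gradient-reversal trick used for the topic classifier above (same mechanics as ReverseLayerF):

import torch
from torch.autograd import Function

class GradReverse(Function):
    @staticmethod
    def forward(ctx, x):
        return x.view_as(x)          # identity in the forward pass
    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.neg()     # flip the gradient sign on the way back

x = torch.ones(3, requires_grad=True)
(GradReverse.apply(x) * 2.0).sum().backward()
print(x.grad)  # tensor([-2., -2., -2.]): the upstream gradient of 2 arrives negated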
FakeVD/code_test/models/SVFEND.py
ADDED
@@ -0,0 +1,110 @@
+import copy
+import json
+import os
+import time
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.transforms as transforms
+from sklearn.metrics import *
+from tqdm import tqdm
+from transformers import AutoConfig, BertModel
+from transformers.models.bert.modeling_bert import BertLayer
+
+from .coattention import *
+from .layers import *
+from FakeVD.code_test.utils.metrics import *
+
+
+class SVFENDModel(torch.nn.Module):
+    def __init__(self, bert_model, fea_dim, dropout):
+        super(SVFENDModel, self).__init__()
+        self.bert = BertModel.from_pretrained("./FakeVD/Models/bert-base-chinese/").requires_grad_(False)
+
+        self.text_dim = 768
+        self.comment_dim = 768
+        self.img_dim = 4096
+        self.video_dim = 4096
+        self.num_frames = 83
+        self.num_audioframes = 50
+        self.num_comments = 23
+        self.dim = fea_dim
+        self.num_heads = 4
+
+        self.dropout = dropout
+
+        self.vggish_layer = torch.hub.load('./FakeVD/Models/torchvggish/', 'vggish', source='local')
+        net_structure = list(self.vggish_layer.children())
+        self.vggish_modified = nn.Sequential(*net_structure[-2:-1])
+
+        self.co_attention_ta = co_attention(d_k=fea_dim, d_v=fea_dim, n_heads=self.num_heads, dropout=self.dropout, d_model=fea_dim,
+                                            visual_len=self.num_audioframes, sen_len=512, fea_v=self.dim, fea_s=self.dim, pos=False)
+        self.co_attention_tv = co_attention(d_k=fea_dim, d_v=fea_dim, n_heads=self.num_heads, dropout=self.dropout, d_model=fea_dim,
+                                            visual_len=self.num_frames, sen_len=512, fea_v=self.dim, fea_s=self.dim, pos=False)
+        self.trm = nn.TransformerEncoderLayer(d_model=self.dim, nhead=2, batch_first=True)
+
+        self.linear_text = nn.Sequential(torch.nn.Linear(self.text_dim, fea_dim), torch.nn.ReLU(), nn.Dropout(p=self.dropout))
+        self.linear_comment = nn.Sequential(torch.nn.Linear(self.comment_dim, fea_dim), torch.nn.ReLU(), nn.Dropout(p=self.dropout))
+        self.linear_img = nn.Sequential(torch.nn.Linear(self.img_dim, fea_dim), torch.nn.ReLU(), nn.Dropout(p=self.dropout))
+        self.linear_video = nn.Sequential(torch.nn.Linear(self.video_dim, fea_dim), torch.nn.ReLU(), nn.Dropout(p=self.dropout))
+        self.linear_intro = nn.Sequential(torch.nn.Linear(self.text_dim, fea_dim), torch.nn.ReLU(), nn.Dropout(p=self.dropout))
+        self.linear_audio = nn.Sequential(torch.nn.Linear(fea_dim, fea_dim), torch.nn.ReLU(), nn.Dropout(p=self.dropout))
+
+        self.classifier = nn.Linear(fea_dim, 2)
+
+    def forward(self, **kwargs):
+
+        ### Title ###
+        title_inputid = kwargs['title_inputid']  # (batch, 512)
+        title_mask = kwargs['title_mask']  # (batch, 512)
+
+        fea_text = self.bert(title_inputid, attention_mask=title_mask)['last_hidden_state']  # (batch, sequence, 768)
+        fea_text = self.linear_text(fea_text)
+
+        ### Audio frames ###
+        audioframes = kwargs['audioframes']  # (batch, 36, 12288)
+        audioframes_masks = kwargs['audioframes_masks']
+        fea_audio = self.vggish_modified(audioframes)  # (batch, frames, 128)
+        fea_audio = self.linear_audio(fea_audio)
+        fea_audio, fea_text = self.co_attention_ta(v=fea_audio, s=fea_text, v_len=fea_audio.shape[1], s_len=fea_text.shape[1])
+        fea_audio = torch.mean(fea_audio, -2)
+
+        ### Image frames ###
+        frames = kwargs['frames']  # (batch, 30, 4096)
+        frames_masks = kwargs['frames_masks']
+        fea_img = self.linear_img(frames)
+        fea_img, fea_text = self.co_attention_tv(v=fea_img, s=fea_text, v_len=fea_img.shape[1], s_len=fea_text.shape[1])
+        fea_img = torch.mean(fea_img, -2)
+
+        fea_text = torch.mean(fea_text, -2)
+
+        ### C3D ###
+        c3d = kwargs['c3d']  # (batch, 36, 4096)
+        c3d_masks = kwargs['c3d_masks']
+        fea_video = self.linear_video(c3d)  # (batch, frames, 128)
+        fea_video = torch.mean(fea_video, -2)
+
+        ### Fusion ###
+        fea_text = fea_text.unsqueeze(1)
+        fea_img = fea_img.unsqueeze(1)
+        fea_audio = fea_audio.unsqueeze(1)
+        fea_video = fea_video.unsqueeze(1)
+
+        fea = torch.cat((fea_text, fea_audio, fea_video, fea_img), 1)  # (bs, 4, 128)
+        fea = self.trm(fea)
+        fea = torch.mean(fea, -2)
+
+        output = self.classifier(fea)
+
+        return output, fea
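A dummy-tensor sketch of the final fusion stage above, with the four pooled modality vectors treated as a 4-token sequence (shapes and dimensions follow the module's defaults; the random inputs are illustrative):

import torch
import torch.nn as nn

bs, dim = 2, 128
# One pooled 128-d vector per modality: text, audio, C3D motion, image frames.
tokens = torch.stack([torch.randn(bs, dim) for _ in range(4)], dim=1)  # (2, 4, 128)

trm = nn.TransformerEncoderLayer(d_model=dim, nhead=2, batch_first=True)
fused = trm(tokens).mean(-2)    # self-attention across modalities, then mean pool
print(fused.shape)              # torch.Size([2, 128]) -> fed to the 2-way classifier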
FakeVD/code_test/models/TikTec.py
ADDED
@@ -0,0 +1,140 @@
+import torch
+from torch import nn
+
+
+class MLP(nn.Module):
+    def __init__(self, input_dim, hidden_dims, output_dim, dropout):
+        super(MLP, self).__init__()
+        layers = list()
+        curr_dim = input_dim
+        for hidden_dim in hidden_dims:
+            layers.append(nn.Linear(curr_dim, hidden_dim))
+            layers.append(nn.BatchNorm1d(hidden_dim))
+            layers.append(nn.ReLU())
+            layers.append(nn.Dropout(p=dropout))
+            curr_dim = hidden_dim
+        layers.append(nn.Linear(curr_dim, output_dim))
+        self.mlp = nn.Sequential(*layers)
+
+    def forward(self, input):
+        return self.mlp(input)
+
+
+class MaskAvg(nn.Module):
+    def __init__(self):
+        super(MaskAvg, self).__init__()
+
+    def forward(self, input, mask):
+        # average over the sequence dimension, ignoring masked (padded) positions
+        score = torch.ones((input.shape[0], input.shape[1]), device=input.device)
+        score = score.masked_fill(mask == 0, float('-inf'))
+        score = torch.softmax(score, dim=-1).unsqueeze(1)
+        output = torch.matmul(score, input).squeeze(1)
+        return output
+
+
+class CVRL(nn.Module):
+    def __init__(self, d_w, d_f, obj_num, gru_dim):
+        super(CVRL, self).__init__()
+        self.gru = nn.GRU(d_w, gru_dim, batch_first=True, bidirectional=True)
+
+        self.linear_r = nn.Linear(d_f, 1)
+        self.linear_h = nn.Linear(2 * gru_dim, obj_num)
+
+    def forward(self, caption_feature, visual_feature):
+        # IN: caption_feature: (bs, K, S, d_w), visual_feature: (bs, K, obj_num, d_f)
+        # OUT: frame_visual_rep: (bs, K, d_f)
+        encoded_caption, _ = self.gru(caption_feature.view(-1, caption_feature.shape[-2], caption_feature.shape[-1]))  # (bs*K, S, 2*gru_dim)
+        encoded_caption = encoded_caption.view(-1, caption_feature.shape[-3], caption_feature.shape[-2], encoded_caption.shape[-1])  # (bs, K, S, 2*gru_dim)
+        frame_caption_rep = encoded_caption.max(dim=2).values  # (bs, K, 2*gru_dim)
+
+        alpha = self.linear_r(visual_feature).squeeze() + self.linear_h(frame_caption_rep)  # (bs, K, obj_num)
+        alpha = torch.softmax(torch.tanh(alpha), dim=-1).unsqueeze(dim=-2)  # (bs, K, 1, obj_num)
+        frame_visual_rep = alpha.matmul(visual_feature)  # (bs, K, 1, d_f)
+        frame_visual_rep = frame_visual_rep.squeeze()  # (bs, K, d_f)
+        return frame_visual_rep
+
+
+class ASRL(nn.Module):
+    def __init__(self, d_w, gru_dim):
+        super(ASRL, self).__init__()
+        self.gru = nn.GRU(d_w, gru_dim, batch_first=True, bidirectional=True)
+
+    def forward(self, asr_feature):
+        # IN: asr_feature: (bs, N, d_w)
+        # OUT: text_audio_rep: (bs, N, 2*gru_dim)
+        text_audio_rep, _ = self.gru(asr_feature)
+        return text_audio_rep
+
+
+class VCIF(nn.Module):
+    def __init__(self, d_f, d_w, d_H, gru_f_dim, gru_w_dim, dropout):
+        super(VCIF, self).__init__()
+
+        self.param_D = nn.Parameter(torch.empty((d_f, d_w)))
+        self.param_Df = nn.Parameter(torch.empty((d_f, d_H)))
+        self.param_Dw = nn.Parameter(torch.empty((d_w, d_H)))
+        self.param_df = nn.Parameter(torch.empty(d_H))
+        self.param_dw = nn.Parameter(torch.empty(d_H))
+
+        self.gru_f = nn.GRU(d_f, gru_f_dim, batch_first=True)
+        self.gru_w = nn.GRU(d_w, gru_w_dim, batch_first=True)
+        self.mask_avg = MaskAvg()
+        self.dropout = nn.Dropout(p=dropout)
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.xavier_uniform_(self.param_D)
+        nn.init.xavier_uniform_(self.param_Df)
+        nn.init.xavier_uniform_(self.param_Dw)
+        nn.init.uniform_(self.param_df)
+        nn.init.uniform_(self.param_dw)
+
+    def forward(self, frame_visual_rep, text_audio_rep, mask_K, mask_N):
+        # IN: frame_visual_rep: (bs, K, d_f), text_audio_rep: (bs, N, d_w)
+        # OUT: video_rep: (bs, gru_f_dim + gru_w_dim)
+        affinity_matrix = torch.tanh(frame_visual_rep.matmul(self.param_D).matmul(text_audio_rep.transpose(-1, -2)))
+        affinity_matrix = self.dropout(affinity_matrix)
+
+        frame_co_att_map = torch.tanh(frame_visual_rep.matmul(self.param_Df) + affinity_matrix.matmul(text_audio_rep).matmul(self.param_Dw))
+        word_co_att_map = torch.tanh(text_audio_rep.matmul(self.param_Dw) + affinity_matrix.transpose(-1, -2).matmul(frame_visual_rep).matmul(self.param_Df))
+        frame_co_att_map = self.dropout(frame_co_att_map)
+        word_co_att_map = self.dropout(word_co_att_map)
+
+        frame_att_weight = torch.softmax(frame_co_att_map.matmul(self.param_df), dim=-1)
+        word_att_weight = torch.softmax(word_co_att_map.matmul(self.param_dw), dim=-1)
+
+        frame_visual_weighted_rep = frame_att_weight.unsqueeze(dim=-1) * frame_visual_rep
+        text_audio_weighted_rep = word_att_weight.unsqueeze(dim=-1) * text_audio_rep
+
+        encoded_visual_rep, _ = self.gru_f(frame_visual_weighted_rep)
+        encoded_speech_rep, _ = self.gru_w(text_audio_weighted_rep)
+
+        visual_rep = self.mask_avg(encoded_visual_rep, mask_K)  # (bs, gru_f_dim)
+        speech_rep = self.mask_avg(encoded_speech_rep, mask_N)  # (bs, gru_w_dim)
+
+        video_rep = torch.cat([visual_rep, speech_rep], dim=-1)
+        return video_rep
+
+
+class TikTecModel(nn.Module):
+    def __init__(self, word_dim=300, mfcc_dim=650, visual_dim=1000, obj_num=45, CVRL_gru_dim=200, ASRL_gru_dim=500, VCIF_d_H=200, VCIF_gru_f_dim=200, VCIF_gru_w_dim=100, VCIF_dropout=0.2, MLP_hidden_dims=[512], MLP_dropout=0.2):
+        super(TikTecModel, self).__init__()
+        self.CVRL = CVRL(d_w=word_dim, d_f=visual_dim, obj_num=obj_num, gru_dim=CVRL_gru_dim)
+        self.ASRL = ASRL(d_w=(word_dim + mfcc_dim), gru_dim=ASRL_gru_dim)
+        self.VCIF = VCIF(d_f=visual_dim, d_w=2 * ASRL_gru_dim, d_H=VCIF_d_H, gru_f_dim=VCIF_gru_f_dim, gru_w_dim=VCIF_gru_w_dim, dropout=VCIF_dropout)
+        self.MLP = MLP(VCIF_gru_f_dim + VCIF_gru_w_dim, MLP_hidden_dims, 2, MLP_dropout)
+
+    def forward(self, **kwargs):
+        # IN:
+        #   caption_feature: (bs, K, S, word_dim) = (bs, 200, 100, 300)
+        #   visual_feature: (bs, K, obj_num, visual_dim) = (bs, 200, 45, 1000)
+        #   asr_feature: (bs, N, word_dim + mfcc_dim) = (bs, 500, 300 + 650)
+        #   mask_K: (bs, K) = (bs, 200)
+        #   mask_N: (bs, N) = (bs, 500)
+        # OUT: (bs, 2)
+        caption_feature = kwargs['caption_feature']
+        visual_feature = kwargs['visual_feature']
+        asr_feature = kwargs['asr_feature']
+        mask_K = kwargs['mask_K']
+        mask_N = kwargs['mask_N']
+
+        frame_visual_rep = self.CVRL(caption_feature, visual_feature)
+        text_audio_rep = self.ASRL(asr_feature)
+        video_rep = self.VCIF(frame_visual_rep, text_audio_rep, mask_K, mask_N)
+        output = self.MLP(video_rep)
+        return output
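A small sketch of the MaskAvg masked-pooling idea above: uniform scores pushed through a softmax with -inf at padded positions reduce to a plain mean over the valid steps.

import torch

x = torch.tensor([[[1.0], [3.0], [100.0]]])      # (bs=1, seq=3, dim=1)
mask = torch.tensor([[1, 1, 0]])                 # third step is padding

score = torch.ones(x.shape[0], x.shape[1]).masked_fill(mask == 0, float('-inf'))
weights = torch.softmax(score, dim=-1).unsqueeze(1)   # [[0.5, 0.5, 0.0]]
print(weights.matmul(x).squeeze(1))              # tensor([[2.]]): the padded 100.0 is ignored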
FakeVD/code_test/models/Trainer.py
ADDED
@@ -0,0 +1,235 @@
1 |
+
import copy
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import time
|
5 |
+
from tkinter import E
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch.nn.functional as F
|
11 |
+
import torchvision.transforms as transforms
|
12 |
+
import tqdm
|
13 |
+
from sklearn.metrics import *
|
14 |
+
from tqdm import tqdm
|
15 |
+
from transformers import BertModel
|
16 |
+
from FakeVD.code_test.utils.metrics import *
|
17 |
+
from zmq import device
|
18 |
+
|
19 |
+
from .coattention import *
|
20 |
+
from .layers import *
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
class Trainer():
|
25 |
+
def __init__(self,
|
26 |
+
model,
|
27 |
+
device,
|
28 |
+
lr,
|
29 |
+
dropout,
|
30 |
+
dataloaders,
|
31 |
+
weight_decay,
|
32 |
+
save_param_path,
|
33 |
+
writer,
|
34 |
+
epoch_stop,
|
35 |
+
epoches,
|
36 |
+
mode,
|
37 |
+
model_name,
|
38 |
+
event_num,
|
39 |
+
save_threshold = 0.0,
|
40 |
+
start_epoch = 0,
|
41 |
+
):
|
42 |
+
|
43 |
+
self.model = model
|
44 |
+
self.device = device
|
45 |
+
self.mode = mode
|
46 |
+
self.model_name = model_name
|
47 |
+
self.event_num = event_num
|
48 |
+
|
49 |
+
self.dataloaders = dataloaders
|
50 |
+
self.start_epoch = start_epoch
|
51 |
+
self.num_epochs = epoches
|
52 |
+
self.epoch_stop = epoch_stop
|
53 |
+
self.save_threshold = save_threshold
|
54 |
+
self.writer = writer
|
55 |
+
|
56 |
+
if os.path.exists(save_param_path):
|
57 |
+
            self.save_param_path = save_param_path
        else:
            # os.makedirs returns None, so create the directory first,
            # then keep the path itself
            os.makedirs(save_param_path)
            self.save_param_path = save_param_path

        self.lr = lr
        self.weight_decay = weight_decay
        self.dropout = dropout

        self.criterion = nn.CrossEntropyLoss()


    def train(self):

        since = time.time()

        self.model.cuda()

        best_model_wts_test = copy.deepcopy(self.model.state_dict())
        best_acc_test = 0.0
        best_epoch_test = 0
        is_earlystop = False

        if self.mode == "eann":
            best_acc_test_event = 0.0
            best_epoch_test_event = 0

        for epoch in range(self.start_epoch, self.start_epoch + self.num_epochs):
            if is_earlystop:
                break
            print('-' * 50)
            print('Epoch {}/{}'.format(epoch + 1, self.start_epoch + self.num_epochs))
            print('-' * 50)

            # decay the learning rate each epoch and rebuild the optimizer
            p = float(epoch) / 100
            lr = self.lr / (1. + 10 * p) ** 0.75
            self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=lr)

            for phase in ['train', 'test']:
                if phase == 'train':
                    self.model.train()
                else:
                    self.model.eval()
                print('-' * 10)
                print(phase.upper())
                print('-' * 10)

                running_loss_fnd = 0.0
                running_loss = 0.0
                tpred = []
                tlabel = []

                if self.mode == "eann":
                    running_loss_event = 0.0
                    tpred_event = []
                    tlabel_event = []

                for batch in tqdm(self.dataloaders[phase]):
                    batch_data = batch
                    for k, v in batch_data.items():
                        batch_data[k] = v.cuda()
                    label = batch_data['label']
                    if self.mode == "eann":
                        label_event = batch_data['label_event']

                    with torch.set_grad_enabled(phase == 'train'):
                        if self.mode == "eann":
                            outputs, outputs_event, fea = self.model(**batch_data)
                            loss_fnd = self.criterion(outputs, label)
                            loss_event = self.criterion(outputs_event, label_event)
                            loss = loss_fnd + loss_event
                            _, preds = torch.max(outputs, 1)
                            _, preds_event = torch.max(outputs_event, 1)
                        else:
                            outputs, fea = self.model(**batch_data)
                            _, preds = torch.max(outputs, 1)
                            loss = self.criterion(outputs, label)

                        if phase == 'train':
                            loss.backward()
                            self.optimizer.step()
                            self.optimizer.zero_grad()

                    tlabel.extend(label.detach().cpu().numpy().tolist())
                    tpred.extend(preds.detach().cpu().numpy().tolist())
                    running_loss += loss.item() * label.size(0)

                    if self.mode == "eann":
                        tlabel_event.extend(label_event.detach().cpu().numpy().tolist())
                        tpred_event.extend(preds_event.detach().cpu().numpy().tolist())
                        running_loss_event += loss_event.item() * label_event.size(0)
                        running_loss_fnd += loss_fnd.item() * label.size(0)

                epoch_loss = running_loss / len(self.dataloaders[phase].dataset)
                print('Loss: {:.4f} '.format(epoch_loss))
                results = metrics(tlabel, tpred)
                print(results)
                self.writer.add_scalar('Loss/' + phase, epoch_loss, epoch + 1)
                self.writer.add_scalar('Acc/' + phase, results['acc'], epoch + 1)
                self.writer.add_scalar('F1/' + phase, results['f1'], epoch + 1)

                if self.mode == "eann":
                    epoch_loss_fnd = running_loss_fnd / len(self.dataloaders[phase].dataset)
                    print('Loss_fnd: {:.4f} '.format(epoch_loss_fnd))
                    epoch_loss_event = running_loss_event / len(self.dataloaders[phase].dataset)
                    print('Loss_event: {:.4f} '.format(epoch_loss_event))
                    self.writer.add_scalar('Loss_fnd/' + phase, epoch_loss_fnd, epoch + 1)
                    self.writer.add_scalar('Loss_event/' + phase, epoch_loss_event, epoch + 1)

                if phase == 'test':
                    if results['acc'] > best_acc_test:
                        best_acc_test = results['acc']
                        best_model_wts_test = copy.deepcopy(self.model.state_dict())
                        best_epoch_test = epoch + 1
                        if best_acc_test > self.save_threshold:
                            torch.save(self.model.state_dict(),
                                       self.save_param_path + "_test_epoch" + str(best_epoch_test) + "_{0:.4f}".format(best_acc_test))
                            print("saved " + self.save_param_path + "_test_epoch" + str(best_epoch_test) + "_{0:.4f}".format(best_acc_test))
                    else:
                        if epoch - best_epoch_test >= self.epoch_stop - 1:
                            is_earlystop = True
                            print("early stopping...")

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print("Best model on test: epoch" + str(best_epoch_test) + "_" + str(best_acc_test))

        if self.mode == "eann":
            print("Event: Best model on test: epoch" + str(best_epoch_test_event) + "_" + str(best_acc_test_event))

        self.model.load_state_dict(best_model_wts_test)
        return self.test()


    def test(self):
        since = time.time()

        self.model.cuda()
        self.model.eval()

        pred = []
        label = []

        if self.mode == "eann":
            pred_event = []
            label_event = []

        for batch in tqdm(self.dataloaders['test']):
            with torch.no_grad():
                batch_data = batch
                for k, v in batch_data.items():
                    batch_data[k] = v.cuda()
                batch_label = batch_data['label']

                if self.mode == "eann":
                    batch_label_event = batch_data['label_event']
                    batch_outputs, batch_outputs_event, fea = self.model(**batch_data)
                    _, batch_preds_event = torch.max(batch_outputs_event, 1)

                    label_event.extend(batch_label_event.detach().cpu().numpy().tolist())
                    pred_event.extend(batch_preds_event.detach().cpu().numpy().tolist())
                else:
                    batch_outputs, fea = self.model(**batch_data)

                _, batch_preds = torch.max(batch_outputs, 1)

                label.extend(batch_label.detach().cpu().numpy().tolist())
                pred.extend(batch_preds.detach().cpu().numpy().tolist())

        print(get_confusionmatrix_fnd(np.array(pred), np.array(label)))
        print(metrics(label, pred))

        if self.mode == "eann" and self.model_name != "FANVM":
            print("event:")
            print(accuracy_score(np.array(label_event), np.array(pred_event)))

        return metrics(label, pred)
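Note on the schedule above: train() rebuilds the Adam optimizer every epoch with a rate decayed as lr0 / (1 + 10p)^0.75 with p = epoch / 100, which also resets Adam's moment estimates each epoch. A minimal standalone sketch of the decay, assuming an illustrative base rate of 1e-4 (not a value fixed by this repo):

# Sketch of the per-epoch learning-rate decay used in Trainer.train().
lr0 = 1e-4  # illustrative base learning rate, hypothetical
for epoch in range(5):
    p = float(epoch) / 100
    lr = lr0 / (1. + 10 * p) ** 0.75  # epoch 0 keeps lr0; later epochs shrink it
    print(f"epoch {epoch}: lr = {lr:.6e}")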
FakeVD/code_test/models/Trainer_3set.py
ADDED
@@ -0,0 +1,241 @@
import copy
import json
import os
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import tqdm
from sklearn.metrics import *
from tqdm import tqdm
from transformers import BertModel
from FakeVD.code_test.utils.metrics import *
from zmq import device

from .coattention import *
from .layers import *


class Trainer3():
    def __init__(self,
                 model,
                 device,
                 lr,
                 dropout,
                 dataloaders,
                 weight_decay,
                 save_param_path,
                 writer,
                 epoch_stop,
                 epoches,
                 mode,
                 model_name,
                 event_num,
                 save_threshold=0.0,
                 start_epoch=0,
                 ):

        self.model = model

        self.device = device
        self.mode = mode
        self.model_name = model_name
        self.event_num = event_num

        self.dataloaders = dataloaders
        self.start_epoch = start_epoch
        self.num_epochs = epoches
        self.epoch_stop = epoch_stop
        self.save_threshold = save_threshold
        self.writer = writer

        if os.path.exists(save_param_path):
            self.save_param_path = save_param_path
        else:
            # os.makedirs returns None, so create the directory first,
            # then keep the path itself
            os.makedirs(save_param_path)
            self.save_param_path = save_param_path

        self.lr = lr
        self.weight_decay = weight_decay
        self.dropout = dropout

        self.criterion = nn.CrossEntropyLoss()


    def train(self):

        since = time.time()

        self.model.cuda()

        best_model_wts_val = copy.deepcopy(self.model.state_dict())
        best_acc_val = 0.0
        best_epoch_val = 0

        is_earlystop = False

        if self.mode == "eann":
            best_acc_val_event = 0.0
            best_epoch_val_event = 0

        for epoch in range(self.start_epoch, self.start_epoch + self.num_epochs):
            if is_earlystop:
                break
            print('-' * 50)
            print('Epoch {}/{}'.format(epoch + 1, self.start_epoch + self.num_epochs))
            print('-' * 50)

            # decay the learning rate each epoch and rebuild the optimizer
            p = float(epoch) / 100
            lr = self.lr / (1. + 10 * p) ** 0.75
            self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=lr)

            for phase in ['train', 'val', 'test']:
                if phase == 'train':
                    self.model.train()
                else:
                    self.model.eval()
                print('-' * 10)
                print(phase.upper())
                print('-' * 10)

                running_loss_fnd = 0.0
                running_loss = 0.0
                tpred = []
                tlabel = []

                if self.mode == "eann":
                    running_loss_event = 0.0
                    tpred_event = []
                    tlabel_event = []

                for batch in tqdm(self.dataloaders[phase]):
                    batch_data = batch
                    for k, v in batch_data.items():
                        batch_data[k] = v.cuda()
                    label = batch_data['label']
                    if self.mode == "eann":
                        label_event = batch_data['label_event']

                    self.optimizer.zero_grad()

                    with torch.set_grad_enabled(phase == 'train'):
                        if self.mode == "eann":
                            outputs, outputs_event, fea = self.model(**batch_data)
                            loss_fnd = self.criterion(outputs, label)
                            loss_event = self.criterion(outputs_event, label_event)
                            loss = loss_fnd + loss_event
                            _, preds = torch.max(outputs, 1)
                            _, preds_event = torch.max(outputs_event, 1)
                        else:
                            outputs, fea = self.model(**batch_data)
                            _, preds = torch.max(outputs, 1)
                            loss = self.criterion(outputs, label)

                        if phase == 'train':
                            loss.backward()
                            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                            self.optimizer.step()
                            self.optimizer.zero_grad()

                    tlabel.extend(label.detach().cpu().numpy().tolist())
                    tpred.extend(preds.detach().cpu().numpy().tolist())
                    running_loss += loss.item() * label.size(0)

                    if self.mode == "eann":
                        tlabel_event.extend(label_event.detach().cpu().numpy().tolist())
                        tpred_event.extend(preds_event.detach().cpu().numpy().tolist())
                        running_loss_event += loss_event.item() * label_event.size(0)
                        running_loss_fnd += loss_fnd.item() * label.size(0)

                epoch_loss = running_loss / len(self.dataloaders[phase].dataset)
                print('Loss: {:.4f} '.format(epoch_loss))
                results = metrics(tlabel, tpred)
                print(results)
                self.writer.add_scalar('Loss/' + phase, epoch_loss, epoch + 1)
                self.writer.add_scalar('Acc/' + phase, results['acc'], epoch + 1)
                self.writer.add_scalar('F1/' + phase, results['f1'], epoch + 1)

                if self.mode == "eann":
                    epoch_loss_fnd = running_loss_fnd / len(self.dataloaders[phase].dataset)
                    print('Loss_fnd: {:.4f} '.format(epoch_loss_fnd))
                    epoch_loss_event = running_loss_event / len(self.dataloaders[phase].dataset)
                    print('Loss_event: {:.4f} '.format(epoch_loss_event))
                    self.writer.add_scalar('Loss_fnd/' + phase, epoch_loss_fnd, epoch + 1)
                    self.writer.add_scalar('Loss_event/' + phase, epoch_loss_event, epoch + 1)

                if phase == 'val' and results['acc'] > best_acc_val:
                    best_acc_val = results['acc']
                    best_model_wts_val = copy.deepcopy(self.model.state_dict())
                    best_epoch_val = epoch + 1
                    if best_acc_val > self.save_threshold:
                        torch.save(self.model.state_dict(),
                                   self.save_param_path + "_val_epoch" + str(best_epoch_val) + "_{0:.4f}".format(best_acc_val))
                        print("saved " + self.save_param_path + "_val_epoch" + str(best_epoch_val) + "_{0:.4f}".format(best_acc_val))
                else:
                    if epoch - best_epoch_val >= self.epoch_stop - 1:
                        is_earlystop = True
                        print("early stopping...")

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print("Best model on val: epoch" + str(best_epoch_val) + "_" + str(best_acc_val))

        if self.mode == "eann":
            print("Event: Best model on val: epoch" + str(best_epoch_val_event) + "_" + str(best_acc_val_event))


        self.model.load_state_dict(best_model_wts_val)

        print("test result when using best model on val")
        return self.test()


    def test(self):
        since = time.time()

        self.model.cuda()
        self.model.eval()

        pred = []
        label = []

        if self.mode == "eann":
            pred_event = []
            label_event = []

        for batch in tqdm(self.dataloaders['test']):
            with torch.no_grad():
                batch_data = batch
                for k, v in batch_data.items():
                    batch_data[k] = v.cuda()
                batch_label = batch_data['label']

                if self.mode == "eann":
                    batch_label_event = batch_data['label_event']
                    batch_outputs, batch_outputs_event, fea = self.model(**batch_data)
                    _, batch_preds_event = torch.max(batch_outputs_event, 1)

                    label_event.extend(batch_label_event.detach().cpu().numpy().tolist())
                    pred_event.extend(batch_preds_event.detach().cpu().numpy().tolist())
                else:
                    batch_outputs, fea = self.model(**batch_data)

                _, batch_preds = torch.max(batch_outputs, 1)

                label.extend(batch_label.detach().cpu().numpy().tolist())
                pred.extend(batch_preds.detach().cpu().numpy().tolist())


        print(get_confusionmatrix_fnd(np.array(pred), np.array(label)))
        print(metrics(label, pred))

        if self.mode == "eann" and self.model_name != "FANVM":
            print("event:")
            print(accuracy_score(np.array(label_event), np.array(pred_event)))

        return metrics(label, pred)
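Compared with Trainer, Trainer3 adds a 'val' phase for model selection and clips gradients before each optimizer step. A minimal, self-contained sketch of the clipping call in isolation (the linear model and random batch below are placeholders, not part of this repo):

import torch
import torch.nn as nn

model = nn.Linear(8, 2)  # placeholder model for illustration
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
x, y = torch.randn(4, 8), torch.randint(0, 2, (4,))
loss = nn.CrossEntropyLoss()(model(x), y)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # cap total grad norm at 1.0
optimizer.step()
optimizer.zero_grad()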
FakeVD/code_test/models/coattention.py
ADDED
@@ -0,0 +1,122 @@
import torch.nn as nn

from .trm import *


class _MultiHeadAttention(nn.Module):
    def __init__(self, d_k, d_v, d_model, n_heads, dropout):
        super(_MultiHeadAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        self.n_heads = n_heads

        self.w_q = Linear(d_model, d_k * n_heads)
        self.w_k = Linear(d_model, d_k * n_heads)
        self.w_v = Linear(d_model, d_v * n_heads)

    def forward(self, q, k, v):
        # q: [b_size x len_q x d_model]
        # k: [b_size x len_k x d_model]
        # v: [b_size x len_k x d_model]
        b_size = q.size(0)

        # q_s: [b_size x n_heads x len_q x d_k]
        # k_s: [b_size x n_heads x len_k x d_k]
        # v_s: [b_size x n_heads x len_k x d_v]
        q_s = self.w_q(q).view(b_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        k_s = self.w_k(k).view(b_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        v_s = self.w_v(v).view(b_size, -1, self.n_heads, self.d_v).transpose(1, 2)
        return q_s, k_s, v_s


class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PoswiseFeedForwardNet, self).__init__()
        self.relu = nn.ReLU()
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = LayerNormalization(d_model)

    def forward(self, inputs):
        # inputs: [b_size x len_q x d_model]
        residual = inputs
        output = self.relu(self.conv1(inputs.transpose(1, 2)))

        # outputs: [b_size x len_q x d_model]
        output = self.conv2(output).transpose(1, 2)
        output = self.dropout(output)

        return self.layer_norm(residual + output)


class MultiHeadAttention(nn.Module):
    def __init__(self, d_k, d_v, n_heads, dropout, d_model, visual_len, sen_len, fea_v, fea_s, pos):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.multihead_attn_v = _MultiHeadAttention(d_k, d_v, d_model, n_heads, dropout)
        self.multihead_attn_s = _MultiHeadAttention(d_k, d_v, d_model, n_heads, dropout)
        self.pos_emb_v = PosEncoding(visual_len * 10, d_model)
        self.pos_emb_s = PosEncoding(sen_len * 10, d_model)
        self.linear_v = nn.Linear(in_features=fea_v, out_features=d_model)
        self.linear_s = nn.Linear(in_features=fea_s, out_features=d_model)
        self.proj_v = Linear(n_heads * d_v, d_model)
        self.proj_s = Linear(n_heads * d_v, d_model)
        self.d_v = d_v
        self.dropout = nn.Dropout(dropout)
        self.layer_norm_v = LayerNormalization(d_model)
        self.layer_norm_s = LayerNormalization(d_model)
        self.attention = ScaledDotProductAttention(d_k, dropout)
        self.pos = pos

    def forward(self, v, s, v_len, s_len):
        b_size = v.size(0)
        # q: [b_size x len_q x d_model]
        # k: [b_size x len_k x d_model]
        # v: [b_size x len_v x d_model] note (len_k == len_v)
        v, s = self.linear_v(v), self.linear_s(s)
        if self.pos:
            pos_v, pos_s = self.pos_emb_v(v_len), self.pos_emb_s(s_len)
            residual_v, residual_s = v + pos_v, s + pos_s
        else:
            residual_v, residual_s = v, s
        # context: a tensor of shape [b_size x len_q x n_heads * d_v]
        q_v, k_v, v_v = self.multihead_attn_v(v, v, v)
        q_s, k_s, v_s = self.multihead_attn_s(s, s, s)
        context_v, attn_v = self.attention(q_v, k_s, v_s)
        context_s, attn_s = self.attention(q_s, k_v, v_v)
        context_v = context_v.transpose(1, 2).contiguous().view(b_size, -1, self.n_heads * self.d_v)
        context_s = context_s.transpose(1, 2).contiguous().view(b_size, -1, self.n_heads * self.d_v)
        # project back to the residual size, outputs: [b_size x len_q x d_model]
        output_v = self.dropout(self.proj_v(context_v))
        output_s = self.dropout(self.proj_s(context_s))
        return self.layer_norm_v(residual_v + output_v), self.layer_norm_s(residual_s + output_s)


class co_attention(nn.Module):
    def __init__(self, d_k, d_v, n_heads, dropout, d_model, visual_len, sen_len, fea_v, fea_s, pos):
        super(co_attention, self).__init__()
        # self.layer_num = layer_num
        # self.multi_head = MultiHeadAttention(d_k=d_k, d_v=d_v, n_heads=n_heads, dropout=dropout, d_model=d_model,
        #                                      visual_len=visual_len, sen_len=sen_len, fea_v=fea_v, fea_s=fea_s, pos=False)
        # self.PoswiseFeedForwardNet_v = nn.ModuleList([PoswiseFeedForwardNet(d_model=d_model, d_ff=256)])
        # self.PoswiseFeedForwardNet_s = nn.ModuleList([PoswiseFeedForwardNet(d_model=d_model, d_ff=256)])
        # self.multi_head = nn.ModuleList([MultiHeadAttention(d_k=d_k, d_v=d_v, n_heads=n_heads, dropout=dropout, d_model=d_model,
        #                                                     visual_len=visual_len, sen_len=sen_len, fea_v=fea_v, fea_s=fea_s, pos=False)])
        # for i in range(1, layer_num):
        #     self.PoswiseFeedForwardNet_v.append(PoswiseFeedForwardNet(d_model=d_model, d_ff=256))
        #     self.PoswiseFeedForwardNet_s.append(PoswiseFeedForwardNet(d_model=d_model, d_ff=256))
        #     self.multi_head.append(MultiHeadAttention(d_k=d_k, d_v=d_v, n_heads=n_heads, dropout=dropout, d_model=d_model,
        #                                               visual_len=visual_len, sen_len=sen_len, fea_v=d_model, fea_s=d_model, pos=True))
        self.multi_head = MultiHeadAttention(d_k=d_k, d_v=d_v, n_heads=n_heads, dropout=dropout, d_model=d_model,
                                             visual_len=visual_len, sen_len=sen_len, fea_v=fea_v, fea_s=fea_s, pos=pos)
        self.PoswiseFeedForwardNet_v = PoswiseFeedForwardNet(d_model=d_model, d_ff=128, dropout=dropout)
        self.PoswiseFeedForwardNet_s = PoswiseFeedForwardNet(d_model=d_model, d_ff=128, dropout=dropout)

    def forward(self, v, s, v_len, s_len):
        # for i in range(self.layer_num):
        #     v, s = self.multi_head[i](v, s, v_len, s_len)
        #     v = self.PoswiseFeedForwardNet_v[i](v)
        #     s = self.PoswiseFeedForwardNet_s[i](s)
        v, s = self.multi_head(v, s, v_len, s_len)
        v = self.PoswiseFeedForwardNet_v(v)
        s = self.PoswiseFeedForwardNet_s(s)
        return v, s
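For reference, a minimal sketch of driving co_attention with random features, assuming the class above is importable; every dimension below is illustrative, chosen only to satisfy the constructor, not taken from the repo's configs:

import torch

co = co_attention(d_k=128, d_v=128, n_heads=4, dropout=0.1, d_model=128,
                  visual_len=83, sen_len=512, fea_v=4096, fea_s=768, pos=False)
v = torch.randn(2, 83, 4096)   # e.g. per-frame visual features (illustrative sizes)
s = torch.randn(2, 512, 768)   # e.g. per-token text features (illustrative sizes)
v_out, s_out = co(v, s, v_len=None, s_len=None)  # lengths are only used when pos=True
print(v_out.shape, s_out.shape)  # torch.Size([2, 83, 128]) torch.Size([2, 512, 128])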
FakeVD/code_test/models/layers.py
ADDED
@@ -0,0 +1,54 @@
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.autograd import Function


class ReverseLayerF(Function):
    @staticmethod
    def forward(ctx, input_, alpha):
        ctx.alpha = alpha
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha
        return output, None


class Attention(nn.Module):
    def __init__(self, dim, heads=2, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        attn = self.attend(dots)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
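ReverseLayerF is a gradient reversal layer: the forward pass is the identity, and the backward pass multiplies incoming gradients by -alpha (the usual building block for adversarial training such as the "eann" mode here). A minimal check of that behavior, assuming ReverseLayerF from this file is in scope:

import torch

x = torch.randn(3, 4, requires_grad=True)
y = ReverseLayerF.apply(x, 1.0)  # identity on the forward pass
y.sum().backward()
print(x.grad)  # every entry is -1.0: the gradient of sum() was flipped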
FakeVD/code_test/models/trm.py
ADDED
@@ -0,0 +1,80 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as init


class Linear(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(Linear, self).__init__()
        self.linear = nn.Linear(in_features, out_features, bias=bias)
        init.xavier_normal_(self.linear.weight)
        init.zeros_(self.linear.bias)

    def forward(self, inputs):
        return self.linear(inputs)


class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k, dropout=.1):
        super(ScaledDotProductAttention, self).__init__()
        self.scale_factor = np.sqrt(d_k)
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, attn_mask=None):
        # q: [b_size x n_heads x len_q x d_k]
        # k: [b_size x n_heads x len_k x d_k]
        # v: [b_size x n_heads x len_v x d_v] note: (len_k == len_v)

        # attn: [b_size x n_heads x len_q x len_k]
        scores = torch.matmul(q, k.transpose(-1, -2)) / self.scale_factor
        if attn_mask is not None:
            assert attn_mask.size() == scores.size()
            scores.masked_fill_(attn_mask, -1e9)
        attn = self.dropout(self.softmax(scores))

        # outputs: [b_size x n_heads x len_q x d_v]
        context = torch.matmul(attn, v)

        return context, attn


class LayerNormalization(nn.Module):
    def __init__(self, d_hid, eps=1e-6):
        super(LayerNormalization, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_hid))
        self.beta = nn.Parameter(torch.zeros(d_hid))
        self.eps = eps

    def forward(self, z):
        mean = z.mean(dim=-1, keepdim=True)
        std = z.std(dim=-1, keepdim=True)
        ln_out = (z - mean) / (std + self.eps)
        ln_out = self.gamma * ln_out + self.beta

        return ln_out


class PosEncoding(nn.Module):
    def __init__(self, max_seq_len, d_word_vec):
        super(PosEncoding, self).__init__()
        pos_enc = np.array(
            [[pos / np.power(10000, 2.0 * (j // 2) / d_word_vec) for j in range(d_word_vec)]
             for pos in range(max_seq_len)])
        pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2])
        pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2])
        pad_row = np.zeros([1, d_word_vec])
        pos_enc = np.concatenate([pad_row, pos_enc]).astype(np.float32)

        # additional single row for PAD idx
        self.pos_enc = nn.Embedding(max_seq_len + 1, d_word_vec)
        # fix positional encoding: exclude weight from grad computation
        self.pos_enc.weight = nn.Parameter(torch.from_numpy(pos_enc), requires_grad=False)
        self.max_len = int(max_seq_len / 10)

    def forward(self, input_len):
        max_len = self.max_len  # torch.max(input_len)
        tensor = torch.cuda.LongTensor if input_len.is_cuda else torch.LongTensor
        # `length` renamed from `len` so the builtin is not shadowed
        input_pos = tensor([list(range(1, length + 1)) + [0] * (max_len - length) for length in input_len])
        return self.pos_enc(input_pos)
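PosEncoding stores a fixed sinusoidal table with one extra PAD row at index 0, and forward maps each sample's valid length to position indices, padding with 0 past the valid length. A minimal sketch, assuming PosEncoding from this file is in scope (the dimensions are illustrative):

import torch

pe = PosEncoding(max_seq_len=830, d_word_vec=128)  # self.max_len = 830 // 10 = 83
lengths = torch.tensor([83, 40])                   # per-sample valid lengths
enc = pe(lengths)                                  # positions beyond a length hit the PAD row
print(enc.shape)                                   # torch.Size([2, 83, 128])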
FakeVD/code_test/predict.py
ADDED
@@ -0,0 +1,162 @@
import os
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np

from tqdm import tqdm
from FakeVD.code_test.utils.metrics import *

from FakeVD.code_test.models.SVFEND import SVFENDModel
from FakeVD.code_test.utils.dataloader import SVFENDDataset
from FakeVD.code_test.run import _init_fn, SVFEND_collate_fn

# from VGGish_Feature_Extractor.my_vggish_folder_fun import vggish_audio
from FakeVD.code_test.VGGish_Feature_Extractor.my_vggish_fun import vggish_audio, load_model_vggish
from FakeVD.code_test.VGG19_Feature_Extractor.vgg19_feature import process_video as vgg19_frame
from FakeVD.code_test.VGG19_Feature_Extractor.vgg19_feature import load_model_vgg19
from FakeVD.code_test.C3D_Feature_Extractor.feature_extractor_vid import feature_extractor as c3d_video
from FakeVD.code_test.C3D_Feature_Extractor.feature_extractor_vid import load_model_c3d
from FakeVD.code_test.Text_Feature_Extractor.main import video_work as asr_text
from FakeVD.code_test.Text_Feature_Extractor.wav2text import wav2text


def load_model(checkpoint_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SVFENDModel(bert_model='bert-base-chinese', fea_dim=128, dropout=0.1)
    # model.load_state_dict(torch.load(checkpoint_path))
    model.load_state_dict(torch.load(checkpoint_path, map_location=device), False)
    model.eval()
    return model


def get_model(checkpoint_path='./FakeVD/code_test/checkpoints/SVFEND/SVFEND/_test_epoch4_0.7943'):
    # load the detection model; checkpoint_path is where the weights are stored
    model_main = load_model(checkpoint_path)
    model_vggish = load_model_vggish()
    model_vgg19 = load_model_vgg19()
    model_c3d = load_model_c3d()
    model_text = wav2text()

    models = {
        'model_main': model_main,
        'model_vggish': model_vggish,
        'model_vgg19': model_vgg19,
        'model_c3d': model_c3d,
        'model_text': model_text
    }

    return models


# label = 0 if item['annotation']=='真' else 1
def test(model, dataloader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # model.cuda()
    model.eval()

    pred = []
    label = []
    prob = []

    for batch in tqdm(dataloader):
        with torch.no_grad():
            batch_data = batch
            for k, v in batch_data.items():
                batch_data[k] = v.to(device)
            batch_label = batch_data['label']

            batch_outputs, fea = model(**batch_data)

            _, batch_preds = torch.max(batch_outputs, 1)

            softmax_probs = F.softmax(batch_outputs, dim=1)  # softmax class probabilities

            label.extend(batch_label.detach().cpu().numpy().tolist())
            pred.extend(batch_preds.detach().cpu().numpy().tolist())
            prob.extend(softmax_probs.detach().cpu().numpy().tolist())  # collect softmax probabilities

    return (label, pred, prob)


def main(models,
         video_file_path,
         preprocessed_flag=False,
         feature_path='./FakeVD/code_test/preprocessed_feature'):
    # preprocessed_flag: whether the video has already been preprocessed
    # feature_path: directory where the extracted features are stored

    # fetch the models
    model_main = models['model_main']
    model_vggish = models['model_vggish']
    model_vgg19 = models['model_vgg19']
    model_c3d = models['model_c3d']
    model_text = models['model_text']

    # folder containing the video
    video_folder_path = os.path.dirname(video_file_path)

    # video file name (with extension)
    video_file_name = os.path.basename(video_file_path)

    # use the file name (without extension) as the video ID
    vids = []
    vid = os.path.splitext(video_file_name)[0]
    vids.append(vid)
    # video_file_name = os.path.basename(video_file_path)
    # vids.append(os.path.splitext(video_file_name)[0])
    # # vids.append(video_file_name.split('_')[1].split('.')[0]

    # VGGish audio feature path
    VGGish_audio_feature_path = os.path.join(feature_path, vid + '.pkl')
    # C3D video feature directory
    C3D_video_feature_path = os.path.join(feature_path, 'C3D/')
    # VGG19 frame feature directory
    VGG19_frame_feature_path = os.path.join(feature_path, 'VGG19/')
    # ASR text feature path
    asr_text_feature_path = os.path.join(feature_path, 'ASR/' + vid + '.json')

    # feature extraction
    if not preprocessed_flag:
        vggish_audio(model_vggish, video_file_path, VGGish_audio_feature_path)
        vgg19_frame(model_vgg19, video_file_name, video_folder_path, VGG19_frame_feature_path)
        c3d_video(model_c3d, C3D_video_feature_path, video_folder_path, video_file_name)
        asr_text(model_text, model_vggish, video_file_path, asr_text_feature_path)

    # data paths
    data = vids
    data_paths = {
        'VGGish_audio': VGGish_audio_feature_path,
        'C3D_video': C3D_video_feature_path,
        'VGG19_frame': VGG19_frame_feature_path,
        'ASR_text': asr_text_feature_path
    }

    # build the Dataset and DataLoader
    dataset = SVFENDDataset(data, data_paths)

    dataloader = DataLoader(dataset, batch_size=1,
                            num_workers=0,
                            pin_memory=True,
                            shuffle=False,
                            worker_init_fn=_init_fn,
                            collate_fn=SVFEND_collate_fn)

    # run the prediction
    predictions = test(model_main, dataloader)
    annotation = '真' if predictions[1][0] == 0 else '假'
    prob_softmax = predictions[2]
    # annotation_prob = max(prob_softmax[0])
    annotation_prob = prob_softmax[0][0]   # probability of being real
    annotation_prob1 = prob_softmax[0][1]  # probability of being fake
    # print the prediction
    print(annotation, annotation_prob, annotation_prob1)

    return annotation_prob1


if __name__ == "__main__":
    # whether the video has already been preprocessed
    preprocessed_flag = False
    video_file_path = "./FakeVD/dataset/videos_1/douyin_6700861687563570439.mp4"
    models = get_model()
    main(models, video_file_path, preprocessed_flag)
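A hypothetical driver reusing the models across several videos; loading the extractors once via get_model and then looping main avoids re-initializing BERT/VGGish/VGG19/C3D per video (the file paths below are illustrative, not real dataset files):

models = get_model()
for path in ["./FakeVD/dataset/videos_1/a.mp4", "./FakeVD/dataset/videos_1/b.mp4"]:  # hypothetical files
    fake_prob = main(models, path, preprocessed_flag=False)
    print(path, fake_prob)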
FakeVD/code_test/run.py
ADDED
@@ -0,0 +1,500 @@
import collections
import json
import os
import time

import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
# from gensim.models import KeyedVectors

from FakeVD.code_test.models.Baselines import *
from FakeVD.code_test.models.FANVM import FANVMModel
from FakeVD.code_test.models.SVFEND import SVFENDModel
from FakeVD.code_test.models.TikTec import TikTecModel

from FakeVD.code_test.utils.dataloader import *
from FakeVD.code_test.models.Trainer import Trainer
from FakeVD.code_test.models.Trainer_3set import Trainer3


def pad_sequence(seq_len, lst, emb):
    result = []
    for video in lst:
        if isinstance(video, list):
            video = torch.stack(video)
        ori_len = video.shape[0]
        if ori_len == 0:
            video = torch.zeros([seq_len, emb], dtype=torch.long)
        elif ori_len >= seq_len:
            if emb == 200:
                video = torch.FloatTensor(video[:seq_len])
            else:
                video = torch.LongTensor(video[:seq_len])
        else:
            video = torch.cat([video, torch.zeros([seq_len - ori_len, video.shape[1]], dtype=torch.long)], dim=0)
            if emb == 200:
                video = torch.FloatTensor(video)
            else:
                video = torch.LongTensor(video)
        result.append(video)
    return torch.stack(result)


def pad_sequence_bbox(seq_len, lst):
    result = []
    for video in lst:
        if isinstance(video, list):
            video = torch.stack(video)
        ori_len = video.shape[0]
        if ori_len == 0:
            video = torch.zeros([seq_len, 45, 4096], dtype=torch.float)
        elif ori_len >= seq_len:
            video = torch.FloatTensor(video[:seq_len])
        else:
            video = torch.cat([video, torch.zeros([seq_len - ori_len, 45, 4096], dtype=torch.float)], dim=0)
        result.append(video)
    return torch.stack(result)


def pad_frame_sequence(seq_len, lst):
    attention_masks = []
    result = []
    for video in lst:
        video = torch.FloatTensor(video)
        ori_len = video.shape[0]
        if ori_len >= seq_len:
            gap = ori_len // seq_len
            video = video[::gap][:seq_len]
            mask = np.ones(seq_len)
        else:
            video = torch.cat((video, torch.zeros([seq_len - ori_len, video.shape[1]], dtype=torch.float)), dim=0)
            mask = np.append(np.ones(ori_len), np.zeros(seq_len - ori_len))
        result.append(video)
        mask = torch.IntTensor(mask)
        attention_masks.append(mask)
    return torch.stack(result), torch.stack(attention_masks)


def _init_fn(worker_id):
    np.random.seed(2022)


def SVFEND_collate_fn(batch):
    num_frames = 83
    num_audioframes = 50

    title_inputid = [item['title_inputid'] for item in batch]
    title_mask = [item['title_mask'] for item in batch]

    frames = [item['frames'] for item in batch]
    frames, frames_masks = pad_frame_sequence(num_frames, frames)

    audioframes = [item['audioframes'] for item in batch]
    audioframes, audioframes_masks = pad_frame_sequence(num_audioframes, audioframes)

    c3d = [item['c3d'] for item in batch]
    c3d, c3d_masks = pad_frame_sequence(num_frames, c3d)

    label = [item['label'] for item in batch]

    return {
        'label': torch.stack(label),
        'title_inputid': torch.stack(title_inputid),
        'title_mask': torch.stack(title_mask),
        'audioframes': audioframes,
        'audioframes_masks': audioframes_masks,
        'frames': frames,
        'frames_masks': frames_masks,
        'c3d': c3d,
        'c3d_masks': c3d_masks,
    }


def FANVM_collate_fn(batch):
    num_comments = 23
    num_frames = 83

    title_inputid = [item['title_inputid'] for item in batch]
    title_mask = [item['title_mask'] for item in batch]

    comments_like = [item['comments_like'] for item in batch]
    comments_inputid = [item['comments_inputid'] for item in batch]
    comments_mask = [item['comments_mask'] for item in batch]

    comments_inputid_resorted = []
    comments_mask_resorted = []
    comments_like_resorted = []

    # sort each sample's comments by like count, descending
    for idx in range(len(comments_like)):
        comments_like_one = comments_like[idx]
        comments_inputid_one = comments_inputid[idx]
        comments_mask_one = comments_mask[idx]
        if comments_like_one.shape != torch.Size([0]):
            comments_inputid_one, comments_mask_one, comments_like_one = (list(t) for t in zip(*sorted(zip(comments_inputid_one, comments_mask_one, comments_like_one), key=lambda s: s[2], reverse=True)))
        comments_inputid_resorted.append(comments_inputid_one)
        comments_mask_resorted.append(comments_mask_one)
        comments_like_resorted.append(comments_like_one)

    comments_inputid = pad_sequence(num_comments, comments_inputid_resorted, 250)
    comments_mask = pad_sequence(num_comments, comments_mask_resorted, 250)
    comments_like = []
    for idx in range(len(comments_like_resorted)):
        comments_like_resorted_one = comments_like_resorted[idx]
        if len(comments_like_resorted_one) >= num_comments:
            comments_like.append(torch.tensor(comments_like_resorted_one[:num_comments]))
        else:
            if isinstance(comments_like_resorted_one, list):
                comments_like.append(torch.tensor(comments_like_resorted_one + [0] * (num_comments - len(comments_like_resorted_one))))
            else:
                comments_like.append(torch.tensor(comments_like_resorted_one.tolist() + [0] * (num_comments - len(comments_like_resorted_one))))

    frames = [item['frames'] for item in batch]
    frames, frames_masks = pad_frame_sequence(num_frames, frames)
    frame_thmub = [item['frame_thmub'] for item in batch]

    label = [item['label'] for item in batch]
    label_event = [item['label_event'] for item in batch]
    s = [item['s'] for item in batch]

    return {
        'label': torch.stack(label),
        'title_inputid': torch.stack(title_inputid),
        'title_mask': torch.stack(title_mask),
        'comments_inputid': comments_inputid,
        'comments_mask': comments_mask,
        'comments_like': torch.stack(comments_like),
        'frames': frames,
        'frames_masks': frames_masks,
        'frame_thmub': torch.stack(frame_thmub),
        's': torch.stack(s),
        'label_event': torch.stack(label_event),
    }


def bbox_collate_fn(batch):
    num_frames = 83

    bbox_vgg = [item['bbox_vgg'] for item in batch]
    bbox_vgg = pad_sequence_bbox(num_frames, bbox_vgg)

    label = [item['label'] for item in batch]

    return {
        'label': torch.stack(label),
        'bbox_vgg': bbox_vgg,
    }


def c3d_collate_fn(batch):
    num_frames = 83

    c3d = [item['c3d'] for item in batch]
    c3d, c3d_masks = pad_frame_sequence(num_frames, c3d)

    label = [item['label'] for item in batch]

    return {
        'label': torch.stack(label),
        'c3d': c3d,
        'c3d_masks': c3d_masks,
    }


def vgg_collate_fn(batch):
    num_frames = 83

    frames = [item['frames'] for item in batch]
    frames, frames_masks = pad_frame_sequence(num_frames, frames)

    label = [item['label'] for item in batch]

    return {
        'label': torch.stack(label),
        'frames': frames,
        'frames_masks': frames_masks,
    }


def comments_collate_fn(batch):
    num_comments = 23

    comments_like = [item['comments_like'] for item in batch]
    comments_inputid = [item['comments_inputid'] for item in batch]
    comments_mask = [item['comments_mask'] for item in batch]

    comments_inputid_resorted = []
    comments_mask_resorted = []
    comments_like_resorted = []

    for idx in range(len(comments_like)):
        comments_like_one = comments_like[idx]
        comments_inputid_one = comments_inputid[idx]
        comments_mask_one = comments_mask[idx]
        if comments_like_one.shape != torch.Size([0]):
            comments_inputid_one, comments_mask_one, comments_like_one = (list(t) for t in zip(*sorted(zip(comments_inputid_one, comments_mask_one, comments_like_one), key=lambda s: s[2], reverse=True)))
        comments_inputid_resorted.append(comments_inputid_one)
        comments_mask_resorted.append(comments_mask_one)
        comments_like_resorted.append(comments_like_one)

    comments_inputid = pad_sequence(num_comments, comments_inputid_resorted, 250)
    comments_mask = pad_sequence(num_comments, comments_mask_resorted, 250)
    comments_like = []
    for idx in range(len(comments_like_resorted)):
        comments_like_resorted_one = comments_like_resorted[idx]
        if len(comments_like_resorted_one) >= num_comments:
            comments_like.append(torch.tensor(comments_like_resorted_one[:num_comments]))
        else:
            if isinstance(comments_like_resorted_one, list):
                comments_like.append(torch.tensor(comments_like_resorted_one + [0] * (num_comments - len(comments_like_resorted_one))))
            else:
                comments_like.append(torch.tensor(comments_like_resorted_one.tolist() + [0] * (num_comments - len(comments_like_resorted_one))))

    label = [item['label'] for item in batch]

    return {
        'label': torch.stack(label),
        'comments_inputid': comments_inputid,
        'comments_mask': comments_mask,
        'comments_like': torch.stack(comments_like),
    }


def title_w2v_collate_fn(batch):
    length_title = 128
    title_w2v = [item['title_w2v'] for item in batch]
    title_w2v = pad_sequence(length_title, title_w2v, 100)

    label = [item['label'] for item in batch]

    return {
        'label': torch.stack(label),
        'title_w2v': title_w2v,
    }


def tictec_collate_fn(batch):
    """
    Collate a list of samples into a single batch.

    Args:
        batch (list of dict): list of individual samples; each sample is a dict
            containing 'label', 'caption_feature', 'visual_feature',
            'asr_feature', 'mask_K' and 'mask_N'.

    Returns:
        dict: the batched data; 'label' is a tensor, and the remaining
            features and masks are tensors as well.
    """
    num_frames = 83


    labels = torch.stack([item['label'] for item in batch])
    caption_features = torch.stack([item['caption_feature'] for item in batch])
    visual_features = torch.stack([item['visual_feature'] for item in batch])
    asr_features = torch.stack([item['asr_feature'] for item in batch])
    mask_Ks = torch.stack([item['mask_K'] for item in batch])
    mask_Ns = torch.stack([item['mask_N'] for item in batch])

    return {
        'label': labels,
        'caption_feature': caption_features,
        'visual_feature': visual_features,
        'asr_feature': asr_features,
        'mask_K': mask_Ks,
        'mask_N': mask_Ns,
    }


class Run():
    def __init__(self, config):

        self.model_name = config['model_name']
        self.mode_eval = config['mode_eval']
        self.fold = config['fold']
        self.data_type = 'SVFEND'

        self.epoches = config['epoches']
        self.batch_size = config['batch_size']
        self.num_workers = config['num_workers']
        self.epoch_stop = config['epoch_stop']
        self.seed = config['seed']
        self.device = config['device']
        self.lr = config['lr']
        self.lambd = config['lambd']
        self.save_param_dir = config['path_param']
        self.path_tensorboard = config['path_tensorboard']
        self.dropout = config['dropout']
        self.weight_decay = config['weight_decay']
        self.event_num = 616
        self.mode = 'normal'


    def get_dataloader(self, data_type, data_fold):
        collate_fn = None

        if data_type == 'SVFEND':
            # note: folds are hard-coded to 1 (train) and 2 (test) in this test script
            dataset_train = SVFENDDataset(f'vid_fold_{1}.txt')
            dataset_test = SVFENDDataset(f'vid_fold_{2}.txt')
            collate_fn = SVFEND_collate_fn
        elif data_type == 'FANVM':
            dataset_train = FANVMDataset_train(f'vid_fold_no_{data_fold}.txt')
            dataset_test = FANVMDataset_test(path_vid_train=f'vid_fold_no_{data_fold}.txt', path_vid_test=f'vid_fold_{data_fold}.txt')
            collate_fn = FANVM_collate_fn
        elif data_type == 'c3d':
            dataset_train = C3DDataset(f'vid_fold_no_{data_fold}.txt')
            dataset_test = C3DDataset(f'vid_fold_{data_fold}.txt')
            collate_fn = c3d_collate_fn
        elif data_type == 'vgg':
            dataset_train = VGGDataset(f'vid_fold_no_{data_fold}.txt')
            dataset_test = VGGDataset(f'vid_fold_{data_fold}.txt')
            collate_fn = vgg_collate_fn
        elif data_type == 'bbox':
            dataset_train = BboxDataset('vid_fold_no1.txt')
            dataset_test = BboxDataset('vid_fold_1.txt')
            collate_fn = bbox_collate_fn
        elif data_type == 'comments':
            dataset_train = CommentsDataset(f'vid_fold_no_{data_fold}.txt')
            dataset_test = CommentsDataset(f'vid_fold_{data_fold}.txt')
            collate_fn = comments_collate_fn
        elif data_type == 'TikTec':
            dataset_train = TikTecDataset(f'vid_fold_no_{data_fold}.txt')
            dataset_test = TikTecDataset(f'vid_fold_{data_fold}.txt')
            collate_fn = tictec_collate_fn
        # elif data_type == 'w2v':
        #     wv_from_text = KeyedVectors.load_word2vec_format("./stores/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt", binary=False)
        #     dataset_train = Title_W2V_Dataset(f'vid_fold_no{data_fold}.txt', wv_from_text)
        #     dataset_test = Title_W2V_Dataset(f'vid_fold_{data_fold}.txt', wv_from_text)
        #     collate_fn = title_w2v_collate_fn

        train_dataloader = DataLoader(dataset_train, batch_size=self.batch_size,
                                      num_workers=self.num_workers,
                                      pin_memory=True,
                                      shuffle=True,
                                      worker_init_fn=_init_fn,
                                      collate_fn=collate_fn)

        test_dataloader = DataLoader(dataset_test, batch_size=self.batch_size,
                                     num_workers=self.num_workers,
                                     pin_memory=True,
                                     shuffle=False,
                                     worker_init_fn=_init_fn,
                                     collate_fn=collate_fn)

        dataloaders = dict(zip(['train', 'test'], [train_dataloader, test_dataloader]))

        return dataloaders


    def get_dataloader_temporal(self, data_type):
        collate_fn = None
        if data_type == 'SVFEND':
            dataset_train = SVFENDDataset('vid_time3_train.txt')
            dataset_val = SVFENDDataset('vid_time3_val.txt')
            dataset_test = SVFENDDataset('vid_time3_test.txt')
            collate_fn = SVFEND_collate_fn
        elif data_type == 'FANVM':
            dataset_train = FANVMDataset_train('vid_time3_train.txt')
            dataset_val = FANVMDataset_test(path_vid_train='vid_time3_train.txt', path_vid_test='vid_time3_valid.txt')
            dataset_test = FANVMDataset_test(path_vid_train='vid_time3_train.txt', path_vid_test='vid_time3_test.txt')
            collate_fn = FANVM_collate_fn
        else:
            # can be added
            print("Not available")

        train_dataloader = DataLoader(dataset_train, batch_size=self.batch_size,
                                      num_workers=self.num_workers,
                                      pin_memory=True,
                                      shuffle=True,
                                      worker_init_fn=_init_fn,
                                      collate_fn=collate_fn)
        val_dataloader = DataLoader(dataset_val, batch_size=self.batch_size,
                                    num_workers=self.num_workers,
                                    pin_memory=True,
                                    shuffle=False,
                                    worker_init_fn=_init_fn,
                                    collate_fn=collate_fn)
        test_dataloader = DataLoader(dataset_test, batch_size=self.batch_size,
                                     num_workers=self.num_workers,
                                     pin_memory=True,
                                     shuffle=False,
                                     worker_init_fn=_init_fn,
                                     collate_fn=collate_fn)

        dataloaders = dict(zip(['train', 'val', 'test'], [train_dataloader, val_dataloader, test_dataloader]))

        return dataloaders


    def get_model(self):
        if self.model_name == 'SVFEND':
            self.model = SVFENDModel(bert_model='bert-base-chinese', fea_dim=128, dropout=self.dropout)
        elif self.model_name == 'FANVM':
            self.model = FANVMModel(bert_model='bert-base-chinese', fea_dim=128)
            self.data_type = "FANVM"
            self.mode = 'eann'
        elif self.model_name == 'C3D':
            self.model = bC3D(fea_dim=128)
            self.data_type = "c3d"
        elif self.model_name == 'VGG':
            self.model = bVGG(fea_dim=128)
            self.data_type = "vgg"
        elif self.model_name == 'Bbox':
            self.model = bBbox(fea_dim=128)
            self.data_type = "bbox"
        elif self.model_name == 'Vggish':
            self.model = bVggish(fea_dim=128)
        elif self.model_name == 'Bert':
            self.model = bBert(bert_model='bert-base-chinese', fea_dim=128, dropout=self.dropout)
        elif self.model_name == 'TextCNN':
            self.model = bTextCNN(fea_dim=128, vocab_size=100)
            self.data_type = "w2v"
        elif self.model_name == 'Comments':
            self.model = bComments(bert_model='bert-base-chinese', fea_dim=128)
            self.data_type = "comments"
        elif self.model_name == 'TikTec':
            self.model = TikTecModel(VCIF_dropout=self.dropout, MLP_dropout=self.dropout)
            self.data_type = 'TikTec'

        return self.model


    def main(self):
        if self.mode_eval == "nocv":
            self.model = self.get_model()
            dataloaders = self.get_dataloader(data_type=self.data_type, data_fold=self.fold)
            trainer = Trainer(model=self.model, device=self.device, lr=self.lr, dataloaders=dataloaders, epoches=self.epoches, dropout=self.dropout, weight_decay=self.weight_decay, mode=self.mode, model_name=self.model_name, event_num=self.event_num,
                              epoch_stop=self.epoch_stop, save_param_path=self.save_param_dir + self.data_type + "/" + self.model_name + "/", writer=SummaryWriter(self.path_tensorboard))
            result = trainer.train()
            for metric in ['acc', 'f1', 'precision', 'recall', 'auc']:
                print('%s : %.4f' % (metric, result[metric]))

        elif self.mode_eval == "temporal":
            self.model = self.get_model()
            dataloaders = self.get_dataloader_temporal(data_type=self.data_type)
            trainer = Trainer3(model=self.model, device=self.device, lr=self.lr, dataloaders=dataloaders, epoches=self.epoches, dropout=self.dropout, weight_decay=self.weight_decay, mode=self.mode, model_name=self.model_name, event_num=self.event_num,
                               epoch_stop=self.epoch_stop, save_param_path=self.save_param_dir + self.data_type + "/" + self.model_name + "/", writer=SummaryWriter(self.path_tensorboard))
            result = trainer.train()
            for metric in ['acc', 'f1', 'precision', 'recall', 'auc']:
                print('%s : %.4f' % (metric, result[metric]))
            return result

        elif self.mode_eval == "cv":
            collate_fn = None
            # if self.model_name == 'TextCNN':
            #     wv_from_text = KeyedVectors.load_word2vec_format("./stores/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt", binary=False)

            history = collections.defaultdict(list)
            for fold in range(1, 6):
                print('-' * 50)
                print('fold %d:' % fold)
                print('-' * 50)
                self.model = self.get_model()
                dataloaders = self.get_dataloader(data_type=self.data_type, data_fold=fold)
                trainer = Trainer(model=self.model, device=self.device, lr=self.lr, dataloaders=dataloaders, epoches=self.epoches, dropout=self.dropout, weight_decay=self.weight_decay, mode=self.mode, model_name=self.model_name, event_num=self.event_num,
                                  epoch_stop=self.epoch_stop, save_param_path=self.save_param_dir + self.data_type + "/" + self.model_name + "/", writer=SummaryWriter(self.path_tensorboard + "fold_" + str(fold) + "/"))

                result = trainer.train()

                history['auc'].append(result['auc'])
                history['f1'].append(result['f1'])
                history['recall'].append(result['recall'])
                history['precision'].append(result['precision'])
                history['acc'].append(result['acc'])

            print('results on 5-fold cross-validation: ')
            for metric in ['acc', 'f1', 'precision', 'recall', 'auc']:
                print('%s : %.4f +/- %.4f' % (metric, np.mean(history[metric]), np.std(history[metric])))

        else:
            print("Not Available")
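For reference, a hypothetical config covering every key that Run.__init__ reads; the values below are illustrative, not the repo's defaults:

config = {
    'model_name': 'SVFEND',   # any branch handled by get_model()
    'mode_eval': 'nocv',      # 'nocv', 'temporal', or 'cv'
    'fold': 1,
    'epoches': 30,
    'batch_size': 16,
    'num_workers': 0,
    'epoch_stop': 5,
    'seed': 2022,
    'device': 0,
    'lr': 1e-4,
    'lambd': 1.0,
    'path_param': './checkpoints/',
    'path_tensorboard': './tensorboard/',
    'dropout': 0.1,
    'weight_decay': 5e-5,
}
Run(config).main()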
FakeVD/code_test/test.py
ADDED
@@ -0,0 +1,13 @@
# cPickle was for Python 2; on Python 3, plain pickle is all you need
import pickle

# note the 'rb' vs 'r' distinction: 'rb' opens a binary file, 'r' is for text files
with open('/mnt/data10t/dazuoye/GROUP2024-GEN6/FakeSV/code_test/preprocessed_feature/douyin_6559701594739313923.pkl', 'rb') as f:
    data = pickle.load(f)

# print the first two key-value pairs (the loop breaks at i >= 2)
for i, (key, value) in enumerate(data.items()):
    if i >= 2:
        break
    print(f"Key: {key}")
    print(f"Value: {value}\n")