"""
Video Face Manipulation Detection Through Ensemble of CNNs
Image and Sound Processing Lab - Politecnico di Milano
Nicolò Bonettini
Edoardo Daniele Cannas
Sara Mandelli
Luca Bondi
Paolo Bestagini
"""
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
available_datasets = [
    'dfdc-35-5-10',
    'ff-c23-720-140-140',
    'ff-c23-720-140-140-5fpv',
    'ff-c23-720-140-140-10fpv',
    'ff-c23-720-140-140-15fpv',
    'ff-c23-720-140-140-20fpv',
    'ff-c23-720-140-140-25fpv',
    'celebdf',  # just for convenience, not used in the original paper
]
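
# Dataset name conventions (as interpreted by get_split_df below):
#   'dfdc-35-5-10'        -> DFDC folders 0-34 for train, 35-39 for val, 40-49 for test
#   'ff-c23-720-140-140'  -> FF++ at c23 compression, 720/140/140 original videos for
#                            train/val/test; a '-Nfpv' suffix keeps only N frames per video
#   'celebdf'             -> Celeb-DF, 600 real videos for train, the rest for val,
#                            official test set for test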


def load_df(dfdc_df_path: str, ffpp_df_path: str, dfdc_faces_dir: str, ffpp_faces_dir: str, dataset: str) -> Tuple[pd.DataFrame, str]:
    """Load the full faces DataFrame and the faces root directory for the given dataset."""
    if dataset.startswith('dfdc'):
        df = pd.read_pickle(dfdc_df_path)
        root = dfdc_faces_dir
    elif dataset.startswith('ff-'):
        df = pd.read_pickle(ffpp_df_path)
        root = ffpp_faces_dir
    else:
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
    return df, root
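
# Illustrative call (hypothetical placeholder paths): the dataset name alone decides
# whether the DFDC or the FF++ faces index is loaded.
#
#   df, root = load_df(dfdc_df_path='path/to/dfdc_faces_df.pkl',
#                      ffpp_df_path='path/to/ffpp_faces_df.pkl',
#                      dfdc_faces_dir='path/to/dfdc_faces',
#                      ffpp_faces_dir='path/to/ffpp_faces',
#                      dataset='ff-c23-720-140-140')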


def get_split_df(df: pd.DataFrame, dataset: str, split: str) -> pd.DataFrame:
    """Select the train/val/test portion of the full faces DataFrame for the given dataset."""
    if dataset == 'dfdc-35-5-10':
        if split == 'train':
            split_df = df[df['folder'].isin(range(35))]
        elif split == 'val':
            split_df = df[df['folder'].isin(range(35, 40))]
        elif split == 'test':
            split_df = df[df['folder'].isin(range(40, 50))]
        else:
            raise NotImplementedError('Unknown split: {}'.format(split))
    elif dataset.startswith('ff-c23-720-140-140'):
        # Save random state
        st0 = np.random.get_state()
        # Set seed for this selection only
        np.random.seed(41)
        # Split on original videos
        crf = dataset.split('-')[1]
        random_youtube_videos = np.random.permutation(
            df[(df['source'] == 'youtube') & (df['quality'] == crf)]['video'].unique())
        train_orig = random_youtube_videos[:720]
        val_orig = random_youtube_videos[720:720 + 140]
        test_orig = random_youtube_videos[720 + 140:]
        if split == 'train':
            split_df = pd.concat((df[df['original'].isin(train_orig)], df[df['video'].isin(train_orig)]), axis=0)
        elif split == 'val':
            split_df = pd.concat((df[df['original'].isin(val_orig)], df[df['video'].isin(val_orig)]), axis=0)
        elif split == 'test':
            split_df = pd.concat((df[df['original'].isin(test_orig)], df[df['video'].isin(test_orig)]), axis=0)
        else:
            raise NotImplementedError('Unknown split: {}'.format(split))
        if dataset.endswith('fpv'):
            # Keep only a fixed number of frames per video (e.g. 'ff-...-5fpv' -> 5 frames per video)
            fpv = int(dataset.rsplit('-', 1)[1][:-3])
            idxs = []
            for video in split_df['video'].unique():
                idxs.append(np.random.choice(split_df[split_df['video'] == video].index, fpv, replace=False))
            idxs = np.concatenate(idxs)
            split_df = split_df.loc[idxs]
        # Restore random state
        np.random.set_state(st0)
    elif dataset == 'celebdf':
        seed = 41
        num_real_train = 600
        # Save random state
        st0 = np.random.get_state()
        # Set seed for this selection only
        np.random.seed(seed)
        # Split on original videos
        random_train_val_real_videos = np.random.permutation(
            df[(df['label'] == False) & (df['test'] == False)]['video'].unique())
        train_orig = random_train_val_real_videos[:num_real_train]
        val_orig = random_train_val_real_videos[num_real_train:]
        if split == 'train':
            split_df = pd.concat((df[df['original'].isin(train_orig)], df[df['video'].isin(train_orig)]), axis=0)
        elif split == 'val':
            split_df = pd.concat((df[df['original'].isin(val_orig)], df[df['video'].isin(val_orig)]), axis=0)
        elif split == 'test':
            split_df = df[df['test'] == True]
        else:
            raise NotImplementedError('Unknown split: {}'.format(split))
        # Restore random state
        np.random.set_state(st0)
    else:
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
    return split_df


def make_splits(dfdc_df: str, ffpp_df: str, dfdc_dir: str, ffpp_dir: str, dbs: Dict[str, List[str]]) -> Dict[str, Dict[str, Tuple[pd.DataFrame, str]]]:
    """
    Make the requested splits and return, for each, the split DataFrame and the faces root directory.
    :param dfdc_df: str, path to the DataFrame containing info on the faces extracted from the DFDC dataset with extract_faces.py
    :param ffpp_df: str, path to the DataFrame containing info on the faces extracted from the FF++ dataset with extract_faces.py
    :param dfdc_dir: str, path to the directory containing the faces extracted from the DFDC dataset with extract_faces.py
    :param ffpp_dir: str, path to the directory containing the faces extracted from the FF++ dataset with extract_faces.py
    :param dbs: {split_name: [split_dataset1, split_dataset2, ...]}
                Example:
                {'train': ['dfdc-35-5-10'], 'val': ['dfdc-35-5-10']}
    :return: split_dict: dictionary mapping each split name ('train', 'val', ...) to a dictionary
             {dataset_name: (split DataFrame, faces root directory)}
             Example:
             {'train': {'dfdc-35-5-10': (dfdc_train_df, 'path/to/dir/of/DFDC/faces')}}
    """
    split_dict = {}
    full_dfs = {}
    for split_name, split_dbs in dbs.items():
        split_dict[split_name] = dict()
        for split_db in split_dbs:
            if split_db not in full_dfs:
                full_dfs[split_db] = load_df(dfdc_df, ffpp_df, dfdc_dir, ffpp_dir, split_db)
            full_df, root = full_dfs[split_db]
            split_df = get_split_df(df=full_df, dataset=split_db, split=split_name)
            split_dict[split_name][split_db] = (split_df, root)
    return split_dict
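

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only, not part of the original module):
    # build train/val splits for the DFDC subset. The pickle and directory paths
    # below are placeholders and must point to the outputs of extract_faces.py.
    splits = make_splits(dfdc_df='path/to/dfdc_faces_df.pkl',
                         ffpp_df='path/to/ffpp_faces_df.pkl',
                         dfdc_dir='path/to/dfdc_faces',
                         ffpp_dir='path/to/ffpp_faces',
                         dbs={'train': ['dfdc-35-5-10'], 'val': ['dfdc-35-5-10']})
    for split_name, split_datasets in splits.items():
        for dataset_name, (split_df, faces_root) in split_datasets.items():
            print(split_name, dataset_name, len(split_df), faces_root)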