# Mohamed Almukhtar
# Duplicate from malmukhtar/ImageDetection
# fc3814c
from typing import List, Dict, Tuple
"""
Video Face Manipulation Detection Through Ensemble of CNNs
Image and Sound Processing Lab - Politecnico di Milano
Nicolò Bonettini
Edoardo Daniele Cannas
Sara Mandelli
Luca Bondi
Paolo Bestagini
"""
import numpy as np
import pandas as pd
# Dataset identifiers understood by the split helpers in this module.
# The '-<N>fpv' variants of the FF++ split keep N frames per video.
available_datasets = [
    'dfdc-35-5-10',
    'ff-c23-720-140-140',
    *('ff-c23-720-140-140-{}fpv'.format(frames) for frames in (5, 10, 15, 20, 25)),
    'celebdf',  # just for convenience, not used in the original paper
]
def load_df(dfdc_df_path: str, ffpp_df_path: str, dfdc_faces_dir: str, ffpp_faces_dir: str,
            dataset: str) -> Tuple[pd.DataFrame, str]:
    """
    Load the pickled DataFrame and pick the faces root directory for a dataset.

    The dataset family is chosen from the prefix of `dataset`: names starting
    with 'dfdc' use the DFDC arguments, names starting with 'ff-' use the FF++
    arguments. Only the pickle of the selected family is read.

    :param dfdc_df_path: path to the pickled DFDC DataFrame
    :param ffpp_df_path: path to the pickled FF++ DataFrame
    :param dfdc_faces_dir: root directory returned for DFDC datasets
    :param ffpp_faces_dir: root directory returned for FF++ datasets
    :param dataset: dataset identifier, e.g. 'dfdc-35-5-10' or 'ff-c23-720-140-140'
    :return: (DataFrame, root directory) for the selected dataset family
    :raises NotImplementedError: if `dataset` starts with neither 'dfdc' nor 'ff-'
        (this includes 'celebdf', for which no path arguments exist here)
    """
    if dataset.startswith('dfdc'):
        df_path, root = dfdc_df_path, dfdc_faces_dir
    elif dataset.startswith('ff-'):
        df_path, root = ffpp_df_path, ffpp_faces_dir
    else:
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
    # NOTE: unpickling can execute arbitrary code; only pass paths to trusted files.
    return pd.read_pickle(df_path), root
def _rows_for_originals(df: pd.DataFrame, originals) -> pd.DataFrame:
    """Return rows whose 'original' is in `originals`, followed by rows whose 'video' is in `originals`."""
    return pd.concat((df[df['original'].isin(originals)], df[df['video'].isin(originals)]), axis=0)


def get_split_df(df: pd.DataFrame, dataset: str, split: str) -> pd.DataFrame:
    """
    Select the rows of `df` that belong to the requested split of a dataset.

    - 'dfdc-35-5-10': split by the 'folder' column (train: 0-34, val: 35-39,
      test: 40-49).
    - 'ff-c23-720-140-140[-<N>fpv]': the unique 'video' values of rows with
      source 'youtube' and the quality taken from the dataset name are shuffled
      with a fixed seed; the first 720 go to train, the next 140 to val, the
      rest to test. Rows are kept when their 'original' or their 'video' is in
      the chosen group. With the '-<N>fpv' suffix, N rows are drawn without
      replacement for each video of the split.
    - 'celebdf': 'test' returns the rows flagged test; otherwise the real,
      non-test videos are shuffled with a fixed seed, the first 600 go to train
      and the rest to val.

    The global NumPy random state is saved before seeding and always restored,
    also when an exception is raised, so the caller's random stream is untouched.

    :param df: DataFrame with the columns required by the chosen dataset
    :param dataset: dataset identifier
    :param split: one of 'train', 'val', 'test'
    :return: DataFrame with the selected rows
    :raises NotImplementedError: on unknown dataset or split
    """
    if dataset == 'dfdc-35-5-10':
        folders_by_split = {
            'train': range(35),
            'val': range(35, 40),
            'test': range(40, 50),
        }
        if split not in folders_by_split:
            raise NotImplementedError('Unknown split: {}'.format(split))
        return df[df['folder'].isin(folders_by_split[split])]

    if dataset.startswith('ff-c23-720-140-140'):
        # Save random state
        st0 = np.random.get_state()
        try:
            # Set seed for this selection only
            np.random.seed(41)
            # Split on original videos
            crf = dataset.split('-')[1]
            random_youtube_videos = np.random.permutation(
                df[(df['source'] == 'youtube') & (df['quality'] == crf)]['video'].unique())
            originals_by_split = {
                'train': random_youtube_videos[:720],
                'val': random_youtube_videos[720:720 + 140],
                'test': random_youtube_videos[720 + 140:],
            }
            if split not in originals_by_split:
                raise NotImplementedError('Unknown split: {}'.format(split))
            split_df = _rows_for_originals(df, originals_by_split[split])

            if dataset.endswith('fpv'):
                # Frames-per-video count is encoded in the last token, e.g. '5fpv' -> 5
                fpv = int(dataset.rsplit('-', 1)[1][:-3])
                idxs = [
                    np.random.choice(split_df[split_df['video'] == video].index, fpv, replace=False)
                    for video in split_df['video'].unique()
                ]
                # An empty split has nothing to sample; np.concatenate([]) would raise.
                if idxs:
                    split_df = split_df.loc[np.concatenate(idxs)]
        finally:
            # Restore random state, even if the selection failed
            np.random.set_state(st0)
        return split_df

    if dataset == 'celebdf':
        seed = 41
        num_real_train = 600
        # Save random state
        st0 = np.random.get_state()
        try:
            # Set seed for this selection only
            np.random.seed(seed)
            # Split on original videos
            random_train_val_real_videos = np.random.permutation(
                df[(df['label'] == False) & (df['test'] == False)]['video'].unique())  # noqa: E712
            if split == 'train':
                split_df = _rows_for_originals(df, random_train_val_real_videos[:num_real_train])
            elif split == 'val':
                split_df = _rows_for_originals(df, random_train_val_real_videos[num_real_train:])
            elif split == 'test':
                split_df = df[df['test'] == True]  # noqa: E712
            else:
                raise NotImplementedError('Unknown split: {}'.format(split))
        finally:
            # Restore random state, even if the selection failed
            np.random.set_state(st0)
        return split_df

    raise NotImplementedError('Unknown dataset: {}'.format(dataset))
def make_splits(dfdc_df: str, ffpp_df: str, dfdc_dir: str, ffpp_dir: str, dbs: Dict[str, List[str]]) -> Dict[str, Dict[str, Tuple[pd.DataFrame, str]]]:
    """
    Build the requested splits and return, for each one, its DataFrame and faces root.

    :param dfdc_df: str, path to the DataFrame containing info on the faces extracted from the DFDC dataset with extract_faces.py
    :param ffpp_df: str, path to the DataFrame containing info on the faces extracted from the FF++ dataset with extract_faces.py
    :param dfdc_dir: str, path to the directory containing the faces extracted from the DFDC dataset with extract_faces.py
    :param ffpp_dir: str, path to the directory containing the faces extracted from the FF++ dataset with extract_faces.py
    :param dbs: {split_name: [dataset1, dataset2, ...]}
        Example:
        {'train': ['dfdc-35-5-10'], 'val': ['dfdc-35-5-10']}
    :return: dictionary {split_name: {dataset: (split DataFrame, faces root directory)}}
        Example:
        {'train': {'dfdc-35-5-10': (dfdc_train_df, 'path/to/dir/of/DFDC/faces')}}
    """
    # Each dataset name is loaded once and reused across all splits.
    loaded = {}
    splits = {}
    for split_name, dataset_names in dbs.items():
        per_dataset = {}
        for split_db in dataset_names:
            if split_db not in loaded:
                loaded[split_db] = load_df(dfdc_df, ffpp_df, dfdc_dir, ffpp_dir, split_db)
            full_df, root = loaded[split_db]
            selected = get_split_df(df=full_df, dataset=split_db, split=split_name)
            per_dataset[split_db] = (selected, root)
        splits[split_name] = per_dataset
    return splits