""" | |
Index the official Kaggle training dataset and prepares a train and validation set based on folders | |
Video Face Manipulation Detection Through Ensemble of CNNs | |
Image and Sound Processing Lab - Politecnico di Milano | |
Nicolò Bonettini | |
Edoardo Daniele Cannas | |
Sara Mandelli | |
Luca Bondi | |
Paolo Bestagini | |
""" | |
import sys
import argparse
from multiprocessing import Pool
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from isplutils.utils import extract_meta_av
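
# Note: extract_meta_av comes from this repo's isplutils package; judging from the
# metadata loop in main() below, it returns (height, width, number of frames) per video.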


def parse_args(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('--source', type=Path, help='Source dir', required=True)
    parser.add_argument('--videodataset', type=Path, default='data/dfdc_videos.pkl',
                        help='Path to save the videos DataFrame')
    parser.add_argument('--batch', type=int, help='Batch size', default=64)
    return parser.parse_args(argv)


def main(argv):
    ## Parameters parsing
    args = parse_args(argv)
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset
    batch_size: int = args.batch

    ## DataFrame
    if videodataset_path.exists():
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        # Create output folder
        videodataset_path.parent.mkdir(parents=True, exist_ok=True)

        # Index: build one DataFrame per metadata.json, indexed by video file name
        df_train_list = list()
        for idx, json_path in enumerate(tqdm(sorted(source_dir.rglob('metadata.json')), desc='Indexing')):
            df_tmp = pd.read_json(json_path, orient='index')
            df_tmp['path'] = df_tmp.index.map(
                lambda x: str(json_path.parent.relative_to(source_dir).joinpath(x)))
            df_tmp['folder'] = int(str(json_path.parts[-2]).split('_')[-1])
            df_train_list.append(df_tmp)
        df_videos = pd.concat(df_train_list, axis=0, verify_integrity=True)

        # Save space: drop the 'split' column and use compact dtypes
        del df_videos['split']
        df_videos['label'] = df_videos['label'] == 'FAKE'
        df_videos['original'] = df_videos['original'].astype('category')
        df_videos['folder'] = df_videos['folder'].astype(np.uint8)
        # Collect metadata
        paths_arr = np.asarray(df_videos.path.map(lambda x: str(source_dir.joinpath(x))))
        height_list = []
        width_list = []
        frames_list = []
        with Pool() as pool:
            for batch_idx0 in tqdm(np.arange(start=0, stop=len(df_videos), step=batch_size), desc='Metadata'):
                batch_res = pool.map(extract_meta_av, paths_arr[batch_idx0:batch_idx0 + batch_size])
                for res in batch_res:
                    height_list.append(res[0])
                    width_list.append(res[1])
                    frames_list.append(res[2])
        df_videos['height'] = np.asarray(height_list, dtype=np.uint16)
        df_videos['width'] = np.asarray(width_list, dtype=np.uint16)
        df_videos['frames'] = np.asarray(frames_list, dtype=np.uint16)
        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))


if __name__ == '__main__':
    main(sys.argv[1:])
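
# A minimal sketch of how the saved DataFrame might be consumed afterwards, e.g. for the
# folder-based train/validation split mentioned in the docstring (the folder threshold
# below is an illustrative assumption, not taken from this script):
#
#   import pandas as pd
#   df = pd.read_pickle('data/dfdc_videos.pkl')
#   train_df = df[df['folder'] < 40]
#   val_df = df[df['folder'] >= 40]
#   print('Train: {}, Validation: {}'.format(len(train_df), len(val_df)))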