In [1]:
# Switch to the parent directory; the relative `metadata/...` paths used below are resolved from there.
# NOTE: not idempotent — re-running this cell moves up one more level.
%cd ..

/admin/home-devforfu/realfake


In [4]:
import json
from pathlib import Path
import pandas as pd
from realfake.utils import read_jsonl, write_jsonl

In [6]:
# Load metadata/all.jsonl into a DataFrame (the preview below shows the columns: path, label, class).
df_all = pd.DataFrame(read_jsonl("metadata/all.jsonl"))

In [7]:
# Load the entries listed in metadata/all.failed.jsonl; their paths are filtered out of df_all below.
# NOTE(review): what "failed" means depends on whatever produced this file — confirm.
df_fail = pd.DataFrame(read_jsonl("metadata/all.failed.jsonl"))

In [8]:
# Preview the first three rows to check the column layout.
df_all.head(3)

Unnamed: 0,path,label,class
0,/fsx/home-devforfu/data/real_imagenet1k/n02797...,real,n02797295
1,/fsx/home-devforfu/data/real_imagenet1k/n02797...,real,n02797295
2,/fsx/home-devforfu/data/real_imagenet1k/n02797...,real,n02797295


In [9]:
# Keep only the rows whose path does not occur in the failed list,
# then renumber the remaining rows from zero.
df_ok = (
    df_all
    .loc[~df_all["path"].isin(df_fail["path"])]
    .reset_index(drop=True)
)

In [10]:
# Show three random rows of the filtered frame (unseeded, so the rows differ between runs).
df_ok.sample(3)

Unnamed: 0,path,label,class
1517638,/fsx/home-devforfu/data/fake_imagenet1k/n02027...,fake,n02027492
1026755,/fsx/home-devforfu/data/real_imagenet1k/n01669...,real,n01669191
7790495,/fsx/home-devforfu/data/fake_2m_all/d8713853-0...,fake,


In [11]:
# Count rows per label to see how balanced real and fake are after filtering.
df_ok["label"].value_counts()

real    4184273
fake    4160720
Name: label, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

def create_metadata(dataset, test_size: float = 0.1, sample: int = None, seed: int = 1):
    """Split image metadata into training and validation rows.

    Rows that carry a ``class`` value are split at class level: the unique
    classes are divided with ``train_test_split`` and every row follows its
    class, so no class appears on both sides of the split. Rows whose
    ``class`` is missing are split row-wise with the same ``test_size`` and
    ``seed``.

    Args:
        dataset: DataFrame with a ``class`` column (missing values allowed).
            A ``label`` column holding ``"real"`` / ``"fake"`` is also
            required when ``sample`` is given.
        test_size: Forwarded to ``train_test_split`` for both the class-level
            and the row-level split.
        sample: If given, draw this many rows from each of the two labels
            before splitting (``2 * sample`` rows in total). ``None`` keeps
            the whole dataset.
        seed: Random state used for the subsampling and for both splits, so
            the same inputs always produce the same result.

    Returns:
        A DataFrame with the training rows followed by the validation rows and
        an extra boolean ``valid`` column (``False`` for training, ``True``
        for validation). The index labels of ``dataset`` are kept.

    Raises:
        ValueError: Raised by pandas when ``sample`` is larger than the number
            of rows available for a label.
    """
    if sample is not None:
        # Seed the subsampling as well; otherwise a fixed `seed` would still
        # yield a different dataset on every run.
        real = dataset[dataset["label"] == "real"].sample(sample, random_state=seed)
        fake = dataset[dataset["label"] == "fake"].sample(sample, random_state=seed)
        dataset = pd.concat([real, fake])

    imagenet_classes = dataset["class"].dropna().unique()

    # train_test_split rejects empty input, so only split when there is
    # something to split; an empty class list selects no rows on either side.
    if len(imagenet_classes) > 0:
        trn, val = train_test_split(imagenet_classes, test_size=test_size, random_state=seed)
    else:
        trn, val = imagenet_classes, imagenet_classes
    trn_data = dataset[dataset["class"].isin(trn)]
    val_data = dataset[dataset["class"].isin(val)]

    # Rows without a class cannot be grouped, so they are split row by row.
    no_class = dataset[dataset["class"].isna()]
    if len(no_class) > 0:
        trn_data_null, val_data_null = train_test_split(no_class, test_size=test_size, random_state=seed)
        trn_data = pd.concat([trn_data, trn_data_null])
        val_data = pd.concat([val_data, val_data_null])

    # assign() returns new frames, so the slices of `dataset` are never mutated.
    trn_data = trn_data.assign(valid=False)
    val_data = val_data.assign(valid=True)

    # Sanity check: a class must never leak from training into validation.
    assert not set(trn_data["class"].dropna()).intersection(val_data["class"].dropna())

    return pd.concat([trn_data, val_data])

In [13]:
# Number of rows to draw per label (passed as `sample` below); set to None to use the whole dataset.
n = 1_000_000

In [14]:
# Build the train/validation metadata from a subsample of `n` rows per label.
df = create_metadata(df_ok, sample=n)
df

Unnamed: 0,path,label,class,valid
135038,/fsx/home-devforfu/data/real_imagenet1k/n01917...,real,n01917289,False
803039,/fsx/home-devforfu/data/real_imagenet1k/n01697...,real,n01697457,False
1280747,/fsx/home-devforfu/data/real_imagenet1k/n02992...,real,n02992211,False
130185,/fsx/home-devforfu/data/real_imagenet1k/n04599...,real,n04599235,False
701554,/fsx/home-devforfu/data/real_imagenet1k/n02108...,real,n02108000,False
...,...,...,...,...
7879868,/fsx/home-devforfu/data/fake_2m_all/3cf77f54-2...,fake,,True
3542472,/fsx/home-devforfu/data/real_aes_400_700/00485...,real,,True
6454613,/fsx/home-devforfu/data/fake_2m_all/1e7c20a8-8...,fake,,True
5466667,/fsx/home-devforfu/data/real_aes_400_700/00441...,real,,True


In [20]:
# Name the output after the total row count in thousands: `n` rows are drawn
# for each of the two labels, hence 2 * n. Without sampling the plain name is used.
if n is None:
    filename = "prepared.jsonl"
else:
    filename = f"prepared.{2*n//1000}k.jsonl"
filename = f"metadata/{filename}"
filename

'metadata/prepared.2000k.jsonl'

In [37]:
# Save the split. to_dict("records") yields one dict per row and leaves out the DataFrame index.
# NOTE(review): write_jsonl presumably writes one record per line — confirm in realfake.utils.
write_jsonl(filename, df.to_dict("records"))

In [21]:
# Reload the saved file; the following cells inspect what was written.
# NOTE: this rebinds `df`, replacing the frame returned by create_metadata.
df = pd.DataFrame(read_jsonl(filename))

In [22]:
# Expect 2 * n rows and four columns (path, label, class, valid).
df.shape

(2000000, 4)

In [23]:
# Share of validation rows. It is close to test_size but not exact, because rows with a
# class are assigned by class rather than row by row, and classes differ in size.
df["valid"].value_counts(normalize=True)

False    0.899385
True     0.100614
Name: valid, dtype: float64