{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a0dae3b6-0612-4744-a466-5c8be9c62923",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/admin/home-devforfu/realfake\n"
]
}
],
"source": [
"%cd .."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4b5f4d1b-40a8-4a61-88ea-502103368b1c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import json\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"from realfake.utils import read_jsonl, write_jsonl"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "df083619-aa93-49b5-aa76-1d2680062927",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"df_all = pd.DataFrame(read_jsonl(\"metadata/all.jsonl\"))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f91769e7-a4b9-4012-9079-441c364d32b3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"df_fail = pd.DataFrame(read_jsonl(\"metadata/all.failed.jsonl\"))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "88c02485-fdf4-42c0-8896-70f43bfaf76f",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" path | \n",
" label | \n",
" class | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" /fsx/home-devforfu/data/real_imagenet1k/n02797... | \n",
" real | \n",
" n02797295 | \n",
"
\n",
" \n",
" 1 | \n",
" /fsx/home-devforfu/data/real_imagenet1k/n02797... | \n",
" real | \n",
" n02797295 | \n",
"
\n",
" \n",
" 2 | \n",
" /fsx/home-devforfu/data/real_imagenet1k/n02797... | \n",
" real | \n",
" n02797295 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" path label class\n",
"0 /fsx/home-devforfu/data/real_imagenet1k/n02797... real n02797295\n",
"1 /fsx/home-devforfu/data/real_imagenet1k/n02797... real n02797295\n",
"2 /fsx/home-devforfu/data/real_imagenet1k/n02797... real n02797295"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_all.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "9bbd0247-1f34-4adc-8647-525792e6d3e5",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"df_ok = df_all[~df_all.path.isin(df_fail.path)].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "77b400a8-83aa-4e38-983a-d56b71245ac9",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" path | \n",
" label | \n",
" class | \n",
"
\n",
" \n",
" \n",
" \n",
" 1517638 | \n",
" /fsx/home-devforfu/data/fake_imagenet1k/n02027... | \n",
" fake | \n",
" n02027492 | \n",
"
\n",
" \n",
" 1026755 | \n",
" /fsx/home-devforfu/data/real_imagenet1k/n01669... | \n",
" real | \n",
" n01669191 | \n",
"
\n",
" \n",
" 7790495 | \n",
" /fsx/home-devforfu/data/fake_2m_all/d8713853-0... | \n",
" fake | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" path label class\n",
"1517638 /fsx/home-devforfu/data/fake_imagenet1k/n02027... fake n02027492\n",
"1026755 /fsx/home-devforfu/data/real_imagenet1k/n01669... real n01669191\n",
"7790495 /fsx/home-devforfu/data/fake_2m_all/d8713853-0... fake None"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ok.sample(3)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "321ac973-5b3b-4626-aa95-e4306680099e",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"real 4184273\n",
"fake 4160720\n",
"Name: label, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ok[\"label\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b0c8b90b-49b3-4aaf-89f2-be2277459ec7",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"def create_metadata(dataset, test_size: float = 0.1, sample: int = None, seed: int = 1):\n",
" if sample is not None:\n",
" real = dataset[dataset[\"label\"] == \"real\"].sample(sample)\n",
" fake = dataset[dataset[\"label\"] == \"fake\"].sample(sample)\n",
" dataset = pd.concat([real, fake])\n",
" \n",
" imagenet_classes = dataset[\"class\"].dropna().unique()\n",
" \n",
" trn, val = train_test_split(imagenet_classes, test_size=test_size, random_state=seed)\n",
" trn_data = dataset[dataset[\"class\"].isin(trn)]\n",
" val_data = dataset[dataset[\"class\"].isin(val)]\n",
"\n",
" no_class = dataset[dataset[\"class\"].isna()]\n",
" trn_data_null, val_data_null = train_test_split(no_class, test_size=test_size, random_state=seed)\n",
" \n",
" trn_data = pd.concat([trn_data, trn_data_null])\n",
" trn_data[\"valid\"] = False\n",
" val_data = pd.concat([val_data, val_data_null])\n",
" val_data[\"valid\"] = True\n",
" \n",
" assert not set(trn_data[\"class\"].dropna()).intersection(val_data[\"class\"].dropna())\n",
" \n",
" return pd.concat([trn_data, val_data])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ae4e4d48-9f81-4ea8-bb84-592b50fef3a9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"n = 1_000_000"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "9ee41625-a30e-4eb9-8989-bda898094a83",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" path | \n",
" label | \n",
" class | \n",
" valid | \n",
"
\n",
" \n",
" \n",
" \n",
" 135038 | \n",
" /fsx/home-devforfu/data/real_imagenet1k/n01917... | \n",
" real | \n",
" n01917289 | \n",
" False | \n",
"
\n",
" \n",
" 803039 | \n",
" /fsx/home-devforfu/data/real_imagenet1k/n01697... | \n",
" real | \n",
" n01697457 | \n",
" False | \n",
"
\n",
" \n",
" 1280747 | \n",
" /fsx/home-devforfu/data/real_imagenet1k/n02992... | \n",
" real | \n",
" n02992211 | \n",
" False | \n",
"
\n",
" \n",
" 130185 | \n",
" /fsx/home-devforfu/data/real_imagenet1k/n04599... | \n",
" real | \n",
" n04599235 | \n",
" False | \n",
"
\n",
" \n",
" 701554 | \n",
" /fsx/home-devforfu/data/real_imagenet1k/n02108... | \n",
" real | \n",
" n02108000 | \n",
" False | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 7879868 | \n",
" /fsx/home-devforfu/data/fake_2m_all/3cf77f54-2... | \n",
" fake | \n",
" None | \n",
" True | \n",
"
\n",
" \n",
" 3542472 | \n",
" /fsx/home-devforfu/data/real_aes_400_700/00485... | \n",
" real | \n",
" None | \n",
" True | \n",
"
\n",
" \n",
" 6454613 | \n",
" /fsx/home-devforfu/data/fake_2m_all/1e7c20a8-8... | \n",
" fake | \n",
" None | \n",
" True | \n",
"
\n",
" \n",
" 5466667 | \n",
" /fsx/home-devforfu/data/real_aes_400_700/00441... | \n",
" real | \n",
" None | \n",
" True | \n",
"
\n",
" \n",
" 6469539 | \n",
" /fsx/home-devforfu/data/fake_2m_all/1b126896-e... | \n",
" fake | \n",
" None | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
2000000 rows × 4 columns
\n",
"
"
],
"text/plain": [
" path label class \\\n",
"135038 /fsx/home-devforfu/data/real_imagenet1k/n01917... real n01917289 \n",
"803039 /fsx/home-devforfu/data/real_imagenet1k/n01697... real n01697457 \n",
"1280747 /fsx/home-devforfu/data/real_imagenet1k/n02992... real n02992211 \n",
"130185 /fsx/home-devforfu/data/real_imagenet1k/n04599... real n04599235 \n",
"701554 /fsx/home-devforfu/data/real_imagenet1k/n02108... real n02108000 \n",
"... ... ... ... \n",
"7879868 /fsx/home-devforfu/data/fake_2m_all/3cf77f54-2... fake None \n",
"3542472 /fsx/home-devforfu/data/real_aes_400_700/00485... real None \n",
"6454613 /fsx/home-devforfu/data/fake_2m_all/1e7c20a8-8... fake None \n",
"5466667 /fsx/home-devforfu/data/real_aes_400_700/00441... real None \n",
"6469539 /fsx/home-devforfu/data/fake_2m_all/1b126896-e... fake None \n",
"\n",
" valid \n",
"135038 False \n",
"803039 False \n",
"1280747 False \n",
"130185 False \n",
"701554 False \n",
"... ... \n",
"7879868 True \n",
"3542472 True \n",
"6454613 True \n",
"5466667 True \n",
"6469539 True \n",
"\n",
"[2000000 rows x 4 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = create_metadata(df_ok, sample=n)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "3094035a-cdf2-4d81-bbaf-d8d185150a27",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'metadata/prepared.2000k.jsonl'"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filename = \"prepared.jsonl\" if n is None else f\"prepared.{2*n//1000}k.jsonl\" \n",
"filename = f\"metadata/{filename}\"\n",
"filename"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "bc990be8-16ca-4a3b-bbc6-eaa652d46d81",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"write_jsonl(filename, df.to_dict(\"records\"))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "7f6a3549-affe-459b-bf4a-bb8e2bc0ac62",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(read_jsonl(filename))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "f36ed292-4f7e-4876-a1ca-b6fc841227b8",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"(2000000, 4)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "b72d0d93-d265-4149-b816-7a62f7a5a17a",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"False 0.899385\n",
"True 0.100614\n",
"Name: valid, dtype: float64"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"valid\"].value_counts(normalize=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}