{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "a0dae3b6-0612-4744-a466-5c8be9c62923", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/admin/home-devforfu/realfake\n" ] } ], "source": [ "%cd .." ] }, { "cell_type": "code", "execution_count": 4, "id": "4b5f4d1b-40a8-4a61-88ea-502103368b1c", "metadata": { "tags": [] }, "outputs": [], "source": [ "import json\n", "from pathlib import Path\n", "import pandas as pd\n", "from realfake.utils import read_jsonl, write_jsonl" ] }, { "cell_type": "code", "execution_count": 6, "id": "df083619-aa93-49b5-aa76-1d2680062927", "metadata": { "tags": [] }, "outputs": [], "source": [ "df_all = pd.DataFrame(read_jsonl(\"metadata/all.jsonl\"))" ] }, { "cell_type": "code", "execution_count": 7, "id": "f91769e7-a4b9-4012-9079-441c364d32b3", "metadata": { "tags": [] }, "outputs": [], "source": [ "df_fail = pd.DataFrame(read_jsonl(\"metadata/all.failed.jsonl\"))" ] }, { "cell_type": "code", "execution_count": 8, "id": "88c02485-fdf4-42c0-8896-70f43bfaf76f", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pathlabelclass
0/fsx/home-devforfu/data/real_imagenet1k/n02797...realn02797295
1/fsx/home-devforfu/data/real_imagenet1k/n02797...realn02797295
2/fsx/home-devforfu/data/real_imagenet1k/n02797...realn02797295
\n", "
" ], "text/plain": [ " path label class\n", "0 /fsx/home-devforfu/data/real_imagenet1k/n02797... real n02797295\n", "1 /fsx/home-devforfu/data/real_imagenet1k/n02797... real n02797295\n", "2 /fsx/home-devforfu/data/real_imagenet1k/n02797... real n02797295" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_all.head(3)" ] }, { "cell_type": "code", "execution_count": 9, "id": "9bbd0247-1f34-4adc-8647-525792e6d3e5", "metadata": { "tags": [] }, "outputs": [], "source": [ "df_ok = df_all[~df_all.path.isin(df_fail.path)].reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 10, "id": "77b400a8-83aa-4e38-983a-d56b71245ac9", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pathlabelclass
1517638/fsx/home-devforfu/data/fake_imagenet1k/n02027...faken02027492
1026755/fsx/home-devforfu/data/real_imagenet1k/n01669...realn01669191
7790495/fsx/home-devforfu/data/fake_2m_all/d8713853-0...fakeNone
\n", "
" ], "text/plain": [ " path label class\n", "1517638 /fsx/home-devforfu/data/fake_imagenet1k/n02027... fake n02027492\n", "1026755 /fsx/home-devforfu/data/real_imagenet1k/n01669... real n01669191\n", "7790495 /fsx/home-devforfu/data/fake_2m_all/d8713853-0... fake None" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ok.sample(3)" ] }, { "cell_type": "code", "execution_count": 11, "id": "321ac973-5b3b-4626-aa95-e4306680099e", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "real 4184273\n", "fake 4160720\n", "Name: label, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ok[\"label\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 12, "id": "b0c8b90b-49b3-4aaf-89f2-be2277459ec7", "metadata": { "tags": [] }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "def create_metadata(dataset, test_size: float = 0.1, sample: int = None, seed: int = 1):\n", " if sample is not None:\n", " real = dataset[dataset[\"label\"] == \"real\"].sample(sample)\n", " fake = dataset[dataset[\"label\"] == \"fake\"].sample(sample)\n", " dataset = pd.concat([real, fake])\n", " \n", " imagenet_classes = dataset[\"class\"].dropna().unique()\n", " \n", " trn, val = train_test_split(imagenet_classes, test_size=test_size, random_state=seed)\n", " trn_data = dataset[dataset[\"class\"].isin(trn)]\n", " val_data = dataset[dataset[\"class\"].isin(val)]\n", "\n", " no_class = dataset[dataset[\"class\"].isna()]\n", " trn_data_null, val_data_null = train_test_split(no_class, test_size=test_size, random_state=seed)\n", " \n", " trn_data = pd.concat([trn_data, trn_data_null])\n", " trn_data[\"valid\"] = False\n", " val_data = pd.concat([val_data, val_data_null])\n", " val_data[\"valid\"] = True\n", " \n", " assert not set(trn_data[\"class\"].dropna()).intersection(val_data[\"class\"].dropna())\n", " \n", " return pd.concat([trn_data, val_data])" ] }, { "cell_type": "code", "execution_count": 13, "id": "ae4e4d48-9f81-4ea8-bb84-592b50fef3a9", "metadata": { "tags": [] }, "outputs": [], "source": [ "n = 1_000_000" ] }, { "cell_type": "code", "execution_count": 14, "id": "9ee41625-a30e-4eb9-8989-bda898094a83", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pathlabelclassvalid
135038/fsx/home-devforfu/data/real_imagenet1k/n01917...realn01917289False
803039/fsx/home-devforfu/data/real_imagenet1k/n01697...realn01697457False
1280747/fsx/home-devforfu/data/real_imagenet1k/n02992...realn02992211False
130185/fsx/home-devforfu/data/real_imagenet1k/n04599...realn04599235False
701554/fsx/home-devforfu/data/real_imagenet1k/n02108...realn02108000False
...............
7879868/fsx/home-devforfu/data/fake_2m_all/3cf77f54-2...fakeNoneTrue
3542472/fsx/home-devforfu/data/real_aes_400_700/00485...realNoneTrue
6454613/fsx/home-devforfu/data/fake_2m_all/1e7c20a8-8...fakeNoneTrue
5466667/fsx/home-devforfu/data/real_aes_400_700/00441...realNoneTrue
6469539/fsx/home-devforfu/data/fake_2m_all/1b126896-e...fakeNoneTrue
\n", "

2000000 rows × 4 columns

\n", "
" ], "text/plain": [ " path label class \\\n", "135038 /fsx/home-devforfu/data/real_imagenet1k/n01917... real n01917289 \n", "803039 /fsx/home-devforfu/data/real_imagenet1k/n01697... real n01697457 \n", "1280747 /fsx/home-devforfu/data/real_imagenet1k/n02992... real n02992211 \n", "130185 /fsx/home-devforfu/data/real_imagenet1k/n04599... real n04599235 \n", "701554 /fsx/home-devforfu/data/real_imagenet1k/n02108... real n02108000 \n", "... ... ... ... \n", "7879868 /fsx/home-devforfu/data/fake_2m_all/3cf77f54-2... fake None \n", "3542472 /fsx/home-devforfu/data/real_aes_400_700/00485... real None \n", "6454613 /fsx/home-devforfu/data/fake_2m_all/1e7c20a8-8... fake None \n", "5466667 /fsx/home-devforfu/data/real_aes_400_700/00441... real None \n", "6469539 /fsx/home-devforfu/data/fake_2m_all/1b126896-e... fake None \n", "\n", " valid \n", "135038 False \n", "803039 False \n", "1280747 False \n", "130185 False \n", "701554 False \n", "... ... \n", "7879868 True \n", "3542472 True \n", "6454613 True \n", "5466667 True \n", "6469539 True \n", "\n", "[2000000 rows x 4 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = create_metadata(df_ok, sample=n)\n", "df" ] }, { "cell_type": "code", "execution_count": 20, "id": "3094035a-cdf2-4d81-bbaf-d8d185150a27", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'metadata/prepared.2000k.jsonl'" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filename = \"prepared.jsonl\" if n is None else f\"prepared.{2*n//1000}k.jsonl\" \n", "filename = f\"metadata/{filename}\"\n", "filename" ] }, { "cell_type": "code", "execution_count": 37, "id": "bc990be8-16ca-4a3b-bbc6-eaa652d46d81", "metadata": { "tags": [] }, "outputs": [], "source": [ "write_jsonl(filename, df.to_dict(\"records\"))" ] }, { "cell_type": "code", "execution_count": 21, "id": "7f6a3549-affe-459b-bf4a-bb8e2bc0ac62", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(read_jsonl(filename))" ] }, { "cell_type": "code", "execution_count": 22, "id": "f36ed292-4f7e-4876-a1ca-b6fc841227b8", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "(2000000, 4)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 23, "id": "b72d0d93-d265-4149-b816-7a62f7a5a17a", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "False 0.899385\n", "True 0.100614\n", "Name: valid, dtype: float64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"valid\"].value_counts(normalize=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }