{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-movies-metadata",
   "metadata": {},
   "source": [
    "# Build `metadata/movies.jsonl`\n",
    "\n",
    "Labels every jpg under `laionimages` as `real` and every jpg under `shotcafe` as `fake`, flags a random 10% of each source with `valid: true`, and saves the records with `write_jsonl`.\n",
    "\n",
    "Last recorded run (before outputs were cleared): 5,133 `laionimages` files and 7,526 `shotcafe` files, giving 11,392 records with `valid: false` and 1,267 with `valid: true`.\n",
    "\n",
    "The split is seeded, so Restart Kernel and Run All reproduces the same file for the same set of images."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6299e4c-f1ba-4be4-ac89-63c05287387c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Move one directory up so that `realfake` imports and the relative `metadata/` output path resolve\n",
    "# (assumes the notebook lives one level below the project root - confirm).\n",
    "# Run once per kernel session: the path is relative, so re-running climbs another level.\n",
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8729a815-81b7-4667-bb97-c85456ad86e8",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from realfake.utils import list_files, write_jsonl"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "config-header",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f78ced08-3a5a-48f8-a635-6d93e82856c1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Set REALFAKE_DATA_ROOT to point at another copy of the data; the default is the original cluster path.\n",
    "DATA_ROOT = Path(os.environ.get(\"REALFAKE_DATA_ROOT\", \"/fsx/home-devforfu/data\"))\n",
    "OUTPUT_PATH = Path(\"metadata/movies.jsonl\")\n",
    "TEST_FRACTION = 0.1  # share of each source flagged valid=True\n",
    "SEED = 42  # fixes the split so re-runs assign the same images to the hold-out set"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "collect-header",
   "metadata": {},
   "source": [
    "## Collect image files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82fa1fee-3ea0-415e-8a6a-8622bc55fabd",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Sorted so the seeded split does not depend on directory listing order\n",
    "# (whether list_files already returns a stable order is not visible from this notebook).\n",
    "laion = sorted(list_files(DATA_ROOT / \"laionimages\", [\"jpg\"]), key=str)\n",
    "movie = sorted(list_files(DATA_ROOT / \"shotcafe\", [\"jpg\"]), key=str)\n",
    "\n",
    "# Fail early with a readable message instead of an opaque error from train_test_split.\n",
    "assert laion, f\"no jpg files found under {DATA_ROOT / 'laionimages'}\"\n",
    "assert movie, f\"no jpg files found under {DATA_ROOT / 'shotcafe'}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58d79304-bdf0-445d-b5dd-46bd2a7bf1ec",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "len(laion), len(movie)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "split-header",
   "metadata": {},
   "source": [
    "## Split and label\n",
    "\n",
    "Each source is split independently, so both contribute the same fraction of held-out images."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "label-records-helper",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def label_records(files, label, test_fraction, seed):\n",
    "    \"\"\"Build one metadata record per file and flag a random hold-out subset.\n",
    "\n",
    "    Args:\n",
    "        files: sequence of file paths; each is stored as ``str(path)``.\n",
    "        label: value written to every record's ``\"label\"`` field.\n",
    "        test_fraction: forwarded to ``train_test_split`` as ``test_size``.\n",
    "        seed: forwarded to ``train_test_split`` as ``random_state``.\n",
    "\n",
    "    Returns:\n",
    "        A list of ``{\"path\", \"label\", \"valid\"}`` dicts in the order of ``files``;\n",
    "        ``valid`` is True for the indices that landed in the test part of the split.\n",
    "    \"\"\"\n",
    "    indices = np.arange(len(files))\n",
    "    _, held_out = train_test_split(indices, test_size=test_fraction, random_state=seed)\n",
    "    # Plain Python ints in a set: O(1) membership tests against the enumerate index.\n",
    "    held_out = set(held_out.tolist())\n",
    "    return [\n",
    "        {\"path\": str(path), \"label\": label, \"valid\": i in held_out}\n",
    "        for i, path in enumerate(files)\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a89dac3-cc8f-46cf-b60a-9118121ff69a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "metadata = (\n",
    "    label_records(laion, \"real\", TEST_FRACTION, SEED)\n",
    "    + label_records(movie, \"fake\", TEST_FRACTION, SEED)\n",
    ")\n",
    "\n",
    "assert len(metadata) == len(laion) + len(movie)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0da02f5-eec6-4114-81fa-901aef933d72",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame(metadata)\n",
    "\n",
    "# Rows: label, columns: valid flag; the `All` margins give the per-label and per-split totals.\n",
    "pd.crosstab(df[\"label\"], df[\"valid\"], margins=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "write-header",
   "metadata": {},
   "source": [
    "## Write metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bdad7b5a-17de-424b-8bfb-d13fe197552b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# write_jsonl's implementation is not visible here; create the directory up front in case it does not.\n",
    "OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)\n",
    "write_jsonl(str(OUTPUT_PATH), metadata)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}