{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df45e4ec-e732-4cf4-8017-ce3753f4cd48",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Move one directory up before anything else runs (presumably to the repository root, so that\n",
    "# the `realfake` import and the relative `metadata/` output path resolve - confirm).\n",
    "# The move is relative: run this cell exactly once per kernel session.\n",
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67eda6f2-e2e5-495a-8795-25365d46c081",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "from collections import defaultdict\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from realfake.utils import list_files, write_jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6e57858-c5f1-4ef9-a69b-31e402215564",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Every train_test_split call below omits random_state, so it draws from NumPy's global RNG.\n",
    "# Seeding it here makes a Restart & Run All reproducible; re-running a single split cell\n",
    "# afterwards advances the RNG and yields a different split.\n",
    "SEED = 1\n",
    "np.random.seed(SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f8e03d2-039d-4aaf-b8bd-173d04b8d888",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Data root. Defaults to the original location; set REALFAKE_DATA_ROOT to run elsewhere.\n",
    "root = Path(os.environ.get(\"REALFAKE_DATA_ROOT\", \"/fsx/home-devforfu/data\"))\n",
    "# One collection of .jpg paths per source directory.\n",
    "laion, movie1, movie2 = [list_files(root/subdir, [\"jpg\"]) for subdir in (\"laionimages\", \"shotcafe\", \"pack1\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1766851a-e1f6-46af-98a1-700aa53dd240",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(movie2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dcb3b8ab-6012-4e78-958b-aef1bc743ef9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Share of each source that is flagged as validation (\"valid\": True).\n",
    "n_test = 0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3b7a9c4-2d61-4f8a-9c35-7a1f0b6d8e42",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def split_records(files, label, test_size):\n",
    "    \"\"\"Build one metadata record per file, flagging a random share as validation.\n",
    "\n",
    "    Args:\n",
    "        files: sized, ordered collection of paths.\n",
    "        label: value stored under the \"label\" key of every record.\n",
    "        test_size: forwarded to train_test_split (fraction or absolute count).\n",
    "\n",
    "    Returns:\n",
    "        A list of {\"path\": str, \"label\": label, \"valid\": bool} dicts in the order of `files`.\n",
    "\n",
    "    The split is drawn over file indices with train_test_split, which consumes the\n",
    "    global NumPy RNG, so the order in which this function is called affects the result.\n",
    "    \"\"\"\n",
    "    held_out = set(train_test_split(np.arange(len(files)), test_size=test_size)[1])\n",
    "    return [{\"path\": str(fn), \"label\": label, \"valid\": i in held_out} for i, fn in enumerate(files)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "419062b0-e594-4644-87f2-180ba864587e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Frame-level random split for the two sources that are split per image.\n",
    "# Keep this call order (laion first, then movie1): each call advances the global RNG.\n",
    "laion_records = split_records(laion, \"real\", n_test)\n",
    "movie1_records = split_records(movie1, \"fake\", n_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f6ed004-8932-4ce9-b4cb-32d0c5c47da7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Group the pack1 frames by the first run of ASCII letters or digits in the file stem,\n",
    "# so that the split in the next cell is made per group instead of per frame.\n",
    "movie_to_frame = defaultdict(list)\n",
    "for fn in movie2:\n",
    "    match = re.search(\"((?:[a-zA-Z]+|[0-9]+))\", fn.stem)\n",
    "    # re.search returns None for a stem without ASCII letters/digits;\n",
    "    # fall back to the whole stem as the group key instead of crashing.\n",
    "    movie_name = match.group(1) if match else fn.stem\n",
    "    movie_to_frame[movie_name].append(fn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb06edea-419c-4e3f-b9dd-f4869e05d3a9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# sorted() gives the keys a stable order. Iterating a set of strings depends on hash\n",
    "# randomization and changes between kernel sessions, which would change the split\n",
    "# even though the RNG is seeded.\n",
    "trn_keys, tst_keys = train_test_split(sorted(movie_to_frame), test_size=n_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ac623e7-c9ae-4467-9fc7-f363adae6e57",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# All frames of a group share the same flag: training groups first, then validation groups.\n",
    "movie2_records = [\n",
    "    {\"path\": str(fn), \"label\": \"fake\", \"valid\": is_valid}\n",
    "    for keys, is_valid in ((trn_keys, False), (tst_keys, True))\n",
    "    for key in keys\n",
    "    for fn in movie_to_frame[key]\n",
    "]\n",
    "\n",
    "# Assemble from the per-source lists instead of appending in place,\n",
    "# so re-running any cell cannot add duplicate records.\n",
    "metadata = laion_records + movie1_records + movie2_records\n",
    "assert len(metadata) == len(laion) + len(movie1) + len(movie2), \"every input file must yield exactly one record\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a70b6b30-fdbb-49cb-b10f-34547c4072bb",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Sanity check: size of the train/validation split and the class balance.\n",
    "df = pd.DataFrame(metadata)\n",
    "print(df.valid.value_counts())\n",
    "print(df.label.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "737a3a2f-428d-490f-a8a9-def76f094ce0",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# One value per label: 1 minus that label's share of all records.\n",
    "# The list follows value_counts order (most frequent label first), not a fixed label order.\n",
    "pos_weight = (1 - df.label.value_counts(normalize=True)).tolist()\n",
    "pos_weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72e03454-cec8-4b58-ba3a-f1fe7d59ee8e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# The path is relative to the working directory set by the first cell.\n",
    "# Create the target directory first so the write does not depend on it already existing.\n",
    "Path(\"metadata\").mkdir(parents=True, exist_ok=True)\n",
    "write_jsonl(\"metadata/movies_plus.jsonl\", metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41c061b1-7f8d-43d5-9b52-88c6144d0bda",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}