realfakerepo
/

realfake

Model card Files Files and versions Community

devforfu committed on Mar 2, 2023

Commit

ea847ad

•

0 Parent(s):

Init

Browse files

Files changed (17) hide show

.gitignore +135 -0
nbs/prepare.ipynb +568 -0
realfake/bin/check_files.py +51 -0
realfake/bin/create_metadata.py +57 -0
realfake/bin/diffusion_db.py +36 -0
realfake/bin/download_s3.py +71 -0
realfake/bin/imagenet.py +32 -0
realfake/bin/inference.py +49 -0
realfake/bin/unpack_diffusion_db.py +41 -0
realfake/callbacks.py +65 -0
realfake/config.py +4 -0
realfake/data.py +101 -0
realfake/models.py +110 -0
realfake/train.py +68 -0
realfake/train_cluster.py +33 -0
realfake/utils.py +121 -0
submit.sh +24 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,135 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+**/lightning_logs
+.*
+!.gitignore
+*.out

nbs/prepare.ipynb ADDED Viewed

	@@ -0,0 +1,568 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a0dae3b6-0612-4744-a466-5c8be9c62923",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/admin/home-devforfu/realfake\n"
+     ]
+    }
+   ],
+   "source": [
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "4b5f4d1b-40a8-4a61-88ea-502103368b1c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pathlib import Path\n",
+    "import pandas as pd\n",
+    "from realfake.utils import read_jsonl, write_jsonl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "df083619-aa93-49b5-aa76-1d2680062927",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "df_all = pd.DataFrame(read_jsonl(\"metadata/all.jsonl\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f91769e7-a4b9-4012-9079-441c364d32b3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "df_fail = pd.DataFrame(read_jsonl(\"metadata/all.failed.jsonl\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "88c02485-fdf4-42c0-8896-70f43bfaf76f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>path</th>\n",
+       "      <th>label</th>\n",
+       "      <th>class</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_imagenet1k/n02797...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>n02797295</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_imagenet1k/n02797...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>n02797295</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_imagenet1k/n02797...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>n02797295</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                path label      class\n",
+       "0  /fsx/home-devforfu/data/real_imagenet1k/n02797...  real  n02797295\n",
+       "1  /fsx/home-devforfu/data/real_imagenet1k/n02797...  real  n02797295\n",
+       "2  /fsx/home-devforfu/data/real_imagenet1k/n02797...  real  n02797295"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_all.head(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "9bbd0247-1f34-4adc-8647-525792e6d3e5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "df_ok = df_all[~df_all.path.isin(df_fail.path)].reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "77b400a8-83aa-4e38-983a-d56b71245ac9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>path</th>\n",
+       "      <th>label</th>\n",
+       "      <th>class</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1517638</th>\n",
+       "      <td>/fsx/home-devforfu/data/fake_imagenet1k/n02027...</td>\n",
+       "      <td>fake</td>\n",
+       "      <td>n02027492</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1026755</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_imagenet1k/n01669...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>n01669191</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7790495</th>\n",
+       "      <td>/fsx/home-devforfu/data/fake_2m_all/d8713853-0...</td>\n",
+       "      <td>fake</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                      path label      class\n",
+       "1517638  /fsx/home-devforfu/data/fake_imagenet1k/n02027...  fake  n02027492\n",
+       "1026755  /fsx/home-devforfu/data/real_imagenet1k/n01669...  real  n01669191\n",
+       "7790495  /fsx/home-devforfu/data/fake_2m_all/d8713853-0...  fake       None"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_ok.sample(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "321ac973-5b3b-4626-aa95-e4306680099e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "real    4184273\n",
+       "fake    4160720\n",
+       "Name: label, dtype: int64"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_ok[\"label\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "b0c8b90b-49b3-4aaf-89f2-be2277459ec7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "def create_metadata(dataset, test_size: float = 0.1, sample: int = None, seed: int = 1):\n",
+    "    # Builds a train/validation split. ImageNet classes are split as whole groups so that no\n",
+    "    # class ends up on both sides; records without a class are split row-wise.\n",
+    "    if sample is not None:\n",
+    "        # Seed the sub-sampling as well, otherwise the result differs between runs despite `seed`.\n",
+    "        real = dataset[dataset[\"label\"] == \"real\"].sample(sample, random_state=seed)\n",
+    "        fake = dataset[dataset[\"label\"] == \"fake\"].sample(sample, random_state=seed)\n",
+    "        dataset = pd.concat([real, fake])\n",
+    "    \n",
+    "    imagenet_classes = dataset[\"class\"].dropna().unique()\n",
+    "    \n",
+    "    trn, val = train_test_split(imagenet_classes, test_size=test_size, random_state=seed)\n",
+    "    trn_data = dataset[dataset[\"class\"].isin(trn)]\n",
+    "    val_data = dataset[dataset[\"class\"].isin(val)]\n",
+    "\n",
+    "    no_class = dataset[dataset[\"class\"].isna()]\n",
+    "    trn_data_null, val_data_null = train_test_split(no_class, test_size=test_size, random_state=seed)\n",
+    "    \n",
+    "    trn_data = pd.concat([trn_data, trn_data_null])\n",
+    "    trn_data[\"valid\"] = False\n",
+    "    val_data = pd.concat([val_data, val_data_null])\n",
+    "    val_data[\"valid\"] = True\n",
+    "    \n",
+    "    # Sanity check: the class-level split must not leak classes into validation.\n",
+    "    assert not set(trn_data[\"class\"].dropna()).intersection(val_data[\"class\"].dropna())\n",
+    "    \n",
+    "    return pd.concat([trn_data, val_data])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "ae4e4d48-9f81-4ea8-bb84-592b50fef3a9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "n = 1_000_000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "9ee41625-a30e-4eb9-8989-bda898094a83",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>path</th>\n",
+       "      <th>label</th>\n",
+       "      <th>class</th>\n",
+       "      <th>valid</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>135038</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_imagenet1k/n01917...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>n01917289</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>803039</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_imagenet1k/n01697...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>n01697457</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1280747</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_imagenet1k/n02992...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>n02992211</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>130185</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_imagenet1k/n04599...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>n04599235</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>701554</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_imagenet1k/n02108...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>n02108000</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7879868</th>\n",
+       "      <td>/fsx/home-devforfu/data/fake_2m_all/3cf77f54-2...</td>\n",
+       "      <td>fake</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3542472</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_aes_400_700/00485...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6454613</th>\n",
+       "      <td>/fsx/home-devforfu/data/fake_2m_all/1e7c20a8-8...</td>\n",
+       "      <td>fake</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5466667</th>\n",
+       "      <td>/fsx/home-devforfu/data/real_aes_400_700/00441...</td>\n",
+       "      <td>real</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6469539</th>\n",
+       "      <td>/fsx/home-devforfu/data/fake_2m_all/1b126896-e...</td>\n",
+       "      <td>fake</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2000000 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                      path label      class  \\\n",
+       "135038   /fsx/home-devforfu/data/real_imagenet1k/n01917...  real  n01917289   \n",
+       "803039   /fsx/home-devforfu/data/real_imagenet1k/n01697...  real  n01697457   \n",
+       "1280747  /fsx/home-devforfu/data/real_imagenet1k/n02992...  real  n02992211   \n",
+       "130185   /fsx/home-devforfu/data/real_imagenet1k/n04599...  real  n04599235   \n",
+       "701554   /fsx/home-devforfu/data/real_imagenet1k/n02108...  real  n02108000   \n",
+       "...                                                    ...   ...        ...   \n",
+       "7879868  /fsx/home-devforfu/data/fake_2m_all/3cf77f54-2...  fake       None   \n",
+       "3542472  /fsx/home-devforfu/data/real_aes_400_700/00485...  real       None   \n",
+       "6454613  /fsx/home-devforfu/data/fake_2m_all/1e7c20a8-8...  fake       None   \n",
+       "5466667  /fsx/home-devforfu/data/real_aes_400_700/00441...  real       None   \n",
+       "6469539  /fsx/home-devforfu/data/fake_2m_all/1b126896-e...  fake       None   \n",
+       "\n",
+       "         valid  \n",
+       "135038   False  \n",
+       "803039   False  \n",
+       "1280747  False  \n",
+       "130185   False  \n",
+       "701554   False  \n",
+       "...        ...  \n",
+       "7879868   True  \n",
+       "3542472   True  \n",
+       "6454613   True  \n",
+       "5466667   True  \n",
+       "6469539   True  \n",
+       "\n",
+       "[2000000 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = create_metadata(df_ok, sample=n)\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "3094035a-cdf2-4d81-bbaf-d8d185150a27",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'metadata/prepared.2000k.jsonl'"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filename = \"prepared.jsonl\" if n is None else f\"prepared.{2*n//1000}k.jsonl\" \n",
+    "filename = f\"metadata/{filename}\"\n",
+    "filename"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "bc990be8-16ca-4a3b-bbc6-eaa652d46d81",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "write_jsonl(filename, df.to_dict(\"records\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "7f6a3549-affe-459b-bf4a-bb8e2bc0ac62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(read_jsonl(filename))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "f36ed292-4f7e-4876-a1ca-b6fc841227b8",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(2000000, 4)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "b72d0d93-d265-4149-b816-7a62f7a5a17a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False    0.899385\n",
+       "True     0.100614\n",
+       "Name: valid, dtype: float64"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\"valid\"].value_counts(normalize=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

realfake/bin/check_files.py ADDED Viewed

	@@ -0,0 +1,51 @@

+"""
+Check that files that are referenced from the JSONL file are valid.
+"""
+import json
+from pathlib import Path
+import numpy as np
+import PIL.Image
+from joblib import delayed, Parallel
+from realfake.utils import inject_args, Args, read_jsonl
+class CheckFilesArgs(Args):
+    # JSONL file whose records are validated; each record's "path" key is checked.
+    jsonl_file: Path
+@inject_args
+def main(args: CheckFilesArgs) -> None:
+    # Validate every record in parallel, then either report success or dump the
+    # failing records (with their "error" field) next to the input file.
+    check = delayed(check_file)
+    checked = Parallel(n_jobs=-1, verbose=100)(check(rec) for rec in read_jsonl(args.jsonl_file))
+    broken = [item for item in checked if item["error"] is not None]
+    if not broken:
+        print("All files are valid")
+        return
+    saved_file = args.jsonl_file.with_suffix(".failed.jsonl")
+    print(f"{len(broken)} files are invalid, saved errors to {saved_file}")
+    with open(saved_file, "w") as f:
+        f.writelines(json.dumps(item) + "\n" for item in broken)
+def check_file(record: dict) -> dict:
+    """Validate the image file referenced by ``record["path"]``.
+
+    Returns a copy of ``record`` with an extra ``"error"`` key: ``None`` when the
+    file exists, is a regular file, has a .jpg/.jpeg/.png suffix and can be fully
+    decoded; otherwise a human-readable description of the first failed check.
+    """
+    path = Path(record["path"])
+    error = None
+    if not path.exists():
+        error = "File does not exist"
+    elif not path.is_file():
+        error = "Path is not a file"
+    elif path.suffix.lower() not in (".jpg", ".jpeg", ".png"):
+        error = "File is not an image file"
+    else:
+        try:
+            # Decode the pixel data (not just the header) so truncated files are caught.
+            # The context manager releases the file handle even when decoding fails.
+            with PIL.Image.open(path) as image:
+                np.asarray(image)
+        except Exception as e:
+            error = f"Image cannot be opened: {e}"
+    return dict(record, error=error)
+if __name__ == '__main__':
+    main()

realfake/bin/create_metadata.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""
+Creates a meta-data file by combining the information from directory structure.
+"""
+import json
+from pathlib import Path
+from pydantic import Field
+from realfake.config import IMAGE_FORMATS
+from realfake.utils import inject_args, Args
+class CreateMetadataArgs(Args):
+    # Directory that contains one sub-directory per dataset name listed in `datasets`.
+    root_dir: Path
+    datasets: str = Field(..., help="Comma-separated list of datasets to include in the meta-data file")
+    jsonl_file: Path = Field(..., help="Path to the output JSONL file")
+@inject_args
+def main(args: CreateMetadataArgs) -> None:
+    # Collects records from every requested dataset directory and writes them as JSONL.
+    # Names are stripped and empty entries dropped: an empty name (e.g. from a trailing
+    # comma) would otherwise resolve to `root_dir` itself and be scanned as a "fake" dataset.
+    datasets = [name.strip() for name in args.datasets.split(",") if name.strip()]
+    records = []
+    for dataset in datasets:
+        # The label is derived from the directory name prefix.
+        label = "real" if dataset.startswith("real") else "fake"
+        dirpath = args.root_dir/dataset
+        assert dirpath.exists(), f"dataset dir does not exist: {dirpath}"
+        # ImageNet-style datasets have one sub-directory per class; the rest are flat.
+        parse = parse_imagenet if "imagenet" in dataset else parse_flat
+        records.extend(parse(dirpath, label))
+    # The output file is only opened after all datasets were parsed successfully.
+    with open(args.jsonl_file, "w") as f:
+        f.writelines(json.dumps(record) + "\n" for record in records)
+def parse_imagenet(dirpath: Path, label: str) -> list:
+    """Collect image records from a ``<dirpath>/<class>/<image>`` directory layout.
+
+    Every entry of ``dirpath`` must be a directory; its name becomes the record's
+    ``"class"``. Files with a suffix outside ``IMAGE_FORMATS`` are reported and skipped.
+    """
+    found = []
+    for classdir in dirpath.iterdir():
+        assert classdir.is_dir(), f"class directory is not a directory: {classdir}"
+        class_name = classdir.name
+        for fn in classdir.iterdir():
+            if fn.suffix.lower() not in IMAGE_FORMATS:
+                print("Not an image file:", fn)
+                continue
+            found.append({"path": str(fn), "label": label, "class": class_name})
+    return found
+def parse_flat(dirpath: Path, label: str) -> list:
+    """Collect image records from a flat directory; the ``"class"`` of each record is ``None``.
+
+    Entries with a suffix outside ``IMAGE_FORMATS`` are reported and skipped.
+    """
+    found = []
+    for fn in dirpath.iterdir():
+        if fn.suffix.lower() not in IMAGE_FORMATS:
+            print("Not an image file:", fn)
+            continue
+        found.append({"path": str(fn), "label": label, "class": None})
+    return found
+if __name__ == "__main__":
+    main()

realfake/bin/diffusion_db.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import json
+from hashlib import md5
+from pathlib import Path
+import datasets
+from tqdm import tqdm
+from realfake.utils import Args, inject_args
+class DownloadParams(Args):
+    # Parent directory; images are saved into `<output_dir>/<subset>`.
+    output_dir: Path
+    # Configuration name passed to `datasets.load_dataset` for "poloclub/diffusiondb".
+    subset: str = "2m_first_1k"
+@inject_args
+def main(params: DownloadParams) -> None:
+    # Streams the requested DiffusionDB subset, stores each image as PNG and writes one
+    # metadata record per image into `test.jsonl` inside the target directory.
+    stream = datasets.load_dataset("poloclub/diffusiondb", params.subset, split="train", streaming=True)
+    output_dir = params.output_dir/params.subset
+    output_dir.mkdir(parents=True, exist_ok=True)
+    jsonl_path = output_dir/"test.jsonl"
+    with jsonl_path.open("w") as fp:
+        for item in tqdm(stream, total=None):
+            # The file name is derived from prompt + seed, so re-runs map to the same file.
+            digest_input = item["prompt"] + str(item["seed"])
+            image_id = md5(digest_input.encode()).hexdigest()
+            target = output_dir/f"{image_id}.png"
+            if not target.exists():
+                item["image"].save(target)
+            line = json.dumps({"path": str(target), "label": "fake", "class": None, "valid": False})
+            fp.write(f"{line}\n")
+    print(f"Saved records to {output_dir}")
+if __name__ == "__main__":
+    main()

realfake/bin/download_s3.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from __future__ import annotations
+import tarfile
+from dataclasses import dataclass
+from pathlib import Path
+import boto3
+from joblib import Parallel, delayed
+from realfake.utils import get_user_name
+def main() -> None:
+    """Download and unpack the LAION-aesthetic tar shards numbered 400..699 in parallel."""
+    start_idx, end_idx = 400, 700
+    bucket = "s-datasets"
+    prefix = "laion-aesthetic/data/laion2B-en-aesthetic/"
+    output_dir = Path(f"/fsx/{get_user_name()}/data/real_aes_{start_idx}_{end_idx}")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    jobs = get_jobs([*range(start_idx, end_idx)], bucket, prefix, output_dir)
+    runner = Parallel(n_jobs=-1, backend="multiprocessing", verbose=100)
+    runner(delayed(download_and_extract)(job) for job in jobs)
+@dataclass
+class Job:
+    # One archive to fetch: the S3 bucket, the object key of the tar file and the
+    # local directory into which the archive is downloaded and extracted.
+    bucket: str
+    key: Path
+    output_dir: Path
+def get_jobs(keys_range: list, bucket: str, prefix: str, output_dir: Path) -> list[Job]:
+    """List the ``.tar`` objects under ``prefix`` whose numeric stem is in ``keys_range``.
+
+    Pages through ``list_objects_v2`` with continuation tokens and returns one
+    ``Job`` per matching key. Objects that are not tar files, or whose file name
+    is not a number, are ignored.
+    """
+    client = boto3.client("s3")
+    wanted = set(keys_range)
+    token, jobs = None, []
+    while True:
+        conf = dict(Bucket=bucket, Prefix=prefix)
+        if token is not None:
+            conf["ContinuationToken"] = token
+        response = client.list_objects_v2(**conf)
+        # "Contents" is missing from the response when no object matches the prefix.
+        for item in response.get("Contents", []):
+            key = Path(item["Key"])
+            # Guard int(): a tar with a non-numeric name must be skipped, not crash the listing.
+            if key.suffix == ".tar" and key.stem.isdecimal() and int(key.stem) in wanted:
+                jobs.append(Job(bucket, key, output_dir))
+        if not response.get("IsTruncated"):
+            break
+        token = response["NextContinuationToken"]
+    return jobs
+def download_and_extract(job: Job) -> None:
+    """Download one tar archive, extract its ``.jpg`` members, then delete the archive.
+
+    The archive is stored temporarily in ``job.output_dir`` and removed afterwards,
+    also when the download or the extraction fails, so no partial archives pile up.
+    """
+    client = boto3.client("s3")
+    tar_file = job.output_dir / job.key.name
+    try:
+        print(f"{job.key}: downloading...")
+        client.download_file(job.bucket, str(job.key), str(tar_file))
+        print(f"{job.key}: extracting...")
+        with tarfile.open(tar_file) as tar:
+            # Iterate members directly: extract(name) would look each name up again
+            # in the member list, making the loop quadratic in the archive size.
+            for member in tar.getmembers():
+                if member.name.endswith(".jpg"):
+                    tar.extract(member, job.output_dir)
+        print(f"{job.key}: done!")
+    finally:
+        if tar_file.exists():
+            tar_file.unlink()
+if __name__ == "__main__":
+    main()

realfake/bin/imagenet.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""
+Unpacks tar files from Imagenet-1k dataset while keeping the original directory structure.
+The script only unpacks the files from training subset.
+"""
+import tarfile
+from pathlib import Path
+from joblib import delayed, Parallel
+from realfake.utils import inject_args, Args
+class ImagenetArgs(Args):
+    # Dataset root; the tar archives are read from its "train" sub-directory.
+    imagenet_dir: Path
+    # Destination root; each archive is extracted into a sub-directory named after it.
+    unpacked_dir: Path
+@inject_args
+def main(args: ImagenetArgs) -> None:
+    # Unpacks every *.tar found in `<imagenet_dir>/train`, one parallel task per archive.
+    train_dir = args.imagenet_dir.joinpath("train")
+    assert train_dir.exists(), f"Directory {train_dir} does not exist"
+    tasks = (delayed(unpack_tar)(archive, args.unpacked_dir) for archive in train_dir.glob("*.tar"))
+    Parallel(n_jobs=-1, verbose=100)(tasks)
+def unpack_tar(tar_file: Path, output_dir: Path) -> None:
+    """Extract ``tar_file`` into ``output_dir/<archive name without suffix>``."""
+    destination = output_dir.joinpath(tar_file.stem)
+    with tarfile.open(tar_file) as archive:
+        archive.extractall(destination)
+if __name__ == "__main__":
+    main()

realfake/bin/inference.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import random
+from pathlib import Path
+import torch
+from torch.utils.data import DataLoader
+from realfake.data import DictDataset, get_augs
+from realfake.models import RealFakeClassifier, RealFakeParams
+from realfake.utils import Args, inject_args, read_jsonl
+class InferenceParams(Args):
+    # Checkpoint to load; "params.json" is read from the same directory.
+    checkpoint_path: Path
+    # JSONL file with the records to evaluate on.
+    test_file: Path
+    # Forwarded to `torch.load(..., map_location=...)`.
+    map_location: str = "cpu"
+    # Number of DataLoader worker processes.
+    num_workers: int = 16
+@inject_args
+def main(params: InferenceParams) -> None:
+    # Loads the checkpoint and prints the accuracy on 10 random samples of the test records.
+    checkpoint = torch.load(params.checkpoint_path, map_location=params.map_location)
+    # todo: use PL mechanism to store hparams
+    model = RealFakeClassifier(RealFakeParams.parse_file(params.checkpoint_path.parent/"params.json"))
+    model.load_state_dict(checkpoint["state_dict"])
+    model.eval()
+    records = read_jsonl(params.test_file)
+    # Clamp the sample size: random.sample raises when asked for more items than available.
+    sample_size = min(1000, len(records))
+    if sample_size == 0:
+        raise ValueError(f"No records found in {params.test_file}")
+    for _ in range(10):
+        selected = random.sample(records, k=sample_size)
+        with torch.inference_mode():
+            ds = DictDataset(selected, get_augs(train=False))
+            dl = DataLoader(ds, batch_size=32, num_workers=params.num_workers, shuffle=False)
+            matched, total = 0, len(ds)
+            for batch in dl:
+                # The model returns the labels one-hot encoded next to the logits.
+                _, logits, y_true_onehot = model(batch)
+                y_true = y_true_onehot.argmax(dim=1)
+                y_pred = logits.softmax(dim=1).argmax(dim=1)
+                matched += (y_true == y_pred).sum().item()
+        print(f"Accuracy: {matched/total:2.2%}")
+if __name__ == "__main__":
+    main()

realfake/bin/unpack_diffusion_db.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import json
+import zipfile
+from itertools import chain
+from pathlib import Path
+from joblib import Parallel, delayed
+from realfake.utils import get_user_name, inject_args, Args
+class UnpackParams(Args):
+    # JSON file listing the archives; entries are read via their "path" and "ok" keys.
+    meta_file: Path
+    # Output JSONL file with one record per extracted PNG.
+    jsonl_file: Path
+    # Number of parallel extraction workers.
+    num_workers: int = 16
+def unpack(zip_path: Path, output_dir: Path):
+    """Extract the whole zip archive into ``output_dir``.
+
+    Returns the destination paths (as strings) of the ``.png`` members only,
+    although every member of the archive is extracted.
+    """
+    print("extracting", zip_path)
+    with zipfile.ZipFile(zip_path, "r") as arch:
+        paths = []
+        for member in arch.namelist():
+            if member.endswith(".png"):
+                paths.append(str(output_dir/member))
+        arch.extractall(output_dir)
+    return paths
+@inject_args
+def main(params: UnpackParams) -> None:
+    # Extracts every archive flagged "ok" in the meta file and writes one JSONL record
+    # per extracted PNG.
+    output_dir = Path(f"/fsx/{get_user_name()}/data/fake_{params.meta_file.stem}")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    entries = json.loads(params.meta_file.read_text())
+    tasks = (delayed(unpack)(Path(entry["path"]), output_dir) for entry in entries if entry["ok"])
+    with Parallel(n_jobs=params.num_workers, verbose=100) as parallel:
+        results = parallel(tasks)
+    with params.jsonl_file.open("w") as fp:
+        for fn in chain.from_iterable(results):
+            line = json.dumps({"path": str(fn), "label": "fake", "class": None, "valid": None})
+            fp.write(line + "\n")
+if __name__ == "__main__":
+    main()

realfake/callbacks.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from typing import Iterable
+import torch.nn as nn
+import pytorch_lightning as pl
+from pytorch_lightning.callbacks import BaseFinetuning, Callback
+from pytorch_lightning.utilities import rank_zero_info
+class ConsoleLogger(Callback):
+    """Callback that records the logged metrics of every epoch and prints them on rank zero."""
+    def __init__(self):
+        super().__init__()
+        self._reset()
+    def get_history(self) -> list:
+        """Return a shallow copy of the per-epoch log entries."""
+        return [*self._history]
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        # Every training run starts with an empty history.
+        self._reset()
+    def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        # The unpacking requires the first scheduler to report exactly one learning rate.
+        [lr] = trainer.lr_scheduler_configs[0].scheduler.get_last_lr()  # type: ignore
+        entry = {"epoch": trainer.current_epoch, "lr": lr}
+        for name, tensor in trainer.logged_metrics.items():
+            entry[name] = tensor.item()
+        self._history.append(entry)
+        rank_zero_info(" | ".join(self._format(key, value) for key, value in entry.items()))
+    @staticmethod
+    def _format(key, value) -> str:
+        # Integers are padded to three characters, floats printed with four decimals.
+        if isinstance(value, int):
+            return f"{key}={value:3d}"
+        if isinstance(value, float):
+            return f"{key}={value:.4f}"
+        return f"{key}={value}"
+    def _reset(self):
+        self._history = []
+class FeatureExtractorFreezeUnfreeze(BaseFinetuning):
+    """Freezes the non-head children of ``pl_module.model`` and unfreezes them at a given epoch."""
+    def __init__(self, unfreeze_at_epoch: int):
+        super().__init__()
+        # Epoch index at which the frozen modules are unfrozen again.
+        self._unfreeze_at_epoch = unfreeze_at_epoch
+    def freeze_before_training(self, pl_module: pl.LightningModule) -> None:
+        # Freeze every child of the wrapped model except those named "head*".
+        rank_zero_info("Freezing backbone")
+        self.freeze(_get_backbone(pl_module.model))
+    def finetune_function(self, pl_module: "pl.LightningModule", epoch: int, optimizer, opt_idx: int) -> None:
+        # Only acts on the exact epoch that was configured; other epochs are a no-op.
+        if epoch == self._unfreeze_at_epoch:
+            rank_zero_info(f"Unfreezing backbone at epoch {epoch}")
+            # Adds the unfrozen modules to the optimizer as a new parameter group;
+            # train_bn=True is passed so batch-norm layers are included.
+            self.unfreeze_and_add_param_group(
+                modules=_get_backbone(pl_module.model),
+                optimizer=optimizer,
+                train_bn=True,
+            )
+def _get_backbone(module: pl.LightningModule) -> Iterable[nn.Module]:
+    """Yield each direct child of ``module`` whose name does not start with ``"head"``."""
+    yield from (
+        child
+        for name, child in module.named_children()
+        if not name.startswith("head")
+    )

realfake/config.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# NOTE(review): presumably the shared random seed; consumers are not visible here.
+SEED = 1
+# Maps the string label stored in metadata records to its integer class index.
+LABELS = {"real": 0, "fake": 1}
+# NOTE(review): presumably dataset split names; usage is not visible here.
+SUBSETS = ("train", "validation")
+# Lower-case file suffixes accepted as images (compared against `Path.suffix.lower()`).
+IMAGE_FORMATS = ".jpeg", ".jpg", ".png"

realfake/data.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from __future__ import annotations
+import random
+import albumentations as A
+import numpy as np
+import PIL.Image
+from albumentations.pytorch.transforms import ToTensorV2
+from torch.utils.data import Dataset, DataLoader
+from realfake.config import LABELS
+# Side length, in pixels, that `get_augs` resizes every image to before cropping.
+IMG_RESIZE = 256
+# Side length, in pixels, of the random (training) or center (evaluation) crop.
+IMG_CROP = 224
+class DictDataset(Dataset):
+    def __init__(self, records: list[dict], transform_x=None):
+        self.records = records
+        self.transform_x = transform_x
+    def __len__(self):
+        return len(self.records)
+    def __getitem__(self, idx):
+        record = self.records[idx]
+        image = np.asarray(PIL.Image.open(record["path"]))
+        if self.transform_x is not None:
+            image = self.transform_x(image=image)["image"]
+        item = {"image": image}
+        if "label" in record:
+            item["label"] = LABELS[record["label"]]
+        return item
+def get_augs(train: bool = True) -> A.Compose:
+    if train:
+        return A.Compose([
+            A.Resize(IMG_RESIZE, IMG_RESIZE),
+            A.RandomCrop(IMG_CROP, IMG_CROP),
+            A.HorizontalFlip(),
+            A.VerticalFlip(),
+            A.RandomBrightnessContrast(),
+            A.Affine(),
+            A.Rotate(),
+            A.CoarseDropout(),
+            ExpandChannels(),
+            RGBAtoRGB(),
+            A.Normalize(),
+            ToTensorV2(),
+        ])
+    else:
+        return A.Compose([
+            A.Resize(IMG_RESIZE, IMG_RESIZE),
+            A.CenterCrop(IMG_CROP, IMG_CROP),
+            ExpandChannels(),
+            RGBAtoRGB(),
+            A.Normalize(),
+            ToTensorV2(),
+        ])
+class ExpandChannels(A.ImageOnlyTransform):
+    """Expands image up to three channes if the image is grayscale."""
+    def __init__(self, always_apply: bool = False, p: float = 0.5):
+        super().__init__(True, 1.0)
+    def apply(self, image, **params):
+        if image.ndim == 2:
+            image = np.repeat(image[..., None], 3, axis=2)
+        elif image.shape[2] == 1:
+            image = np.repeat(image, 3, axis=2)
+        return image
+class RGBAtoRGB(A.ImageOnlyTransform):
+    """Converts RGBA image to RGB."""
+    def __init__(self, always_apply: bool = False, p: float = 0.5):
+        super().__init__(True, 1.0)
+    def apply(self, image, **params):
+        if image.shape[2] == 4:
+            image = image[:, :, :3]
+        return image
+def get_dss(records: list) -> tuple[DictDataset, DictDataset]:
+    train_records = [x for x in records if not x["valid"]]
+    valid_records = [x for x in records if x["valid"]]
+    assert len(train_records) + len(valid_records) == len(records)
+    random.shuffle(train_records)
+    train_ds = DictDataset(train_records, transform_x=get_augs(train=True))
+    valid_ds = DictDataset(valid_records, transform_x=get_augs(train=False))
+    return train_ds, valid_ds
+def get_dls(train_ds: DictDataset, valid_ds: DictDataset, bs: int, num_workers: int) -> tuple[DataLoader, DataLoader]:
+    train_dl = DataLoader(train_ds, batch_size=bs, num_workers=num_workers)
+    valid_dl = DataLoader(valid_ds, batch_size=bs, num_workers=num_workers, shuffle=False)
+    return train_dl, valid_dl

realfake/models.py ADDED Viewed

	@@ -0,0 +1,110 @@

+from __future__ import annotations
+import json
+from pathlib import Path
+import pytorch_lightning as pl
+import timm
+import torch
+import torch.nn as nn
+import torchmetrics
+from pydantic import BaseModel, Field
+from realfake.data import get_dss, get_dls
+from realfake.utils import Args
+# Number of model outputs and the width of the one-hot training targets.
+N_CLASSES = 2
+class AcceleratorParams(BaseModel):
+    """PyTorch Lightning accelerator parameters."""
+    # Passed to the Trainer as `accelerator`.
+    name: str = Field("gpu")
+    # Passed to the Trainer as `devices`; the training entry point also uses `devices * 4` loader workers.
+    devices: int = Field(4)
+    # Passed to the Trainer as `strategy`, but only when a SLURM job id is present.
+    strategy: str = Field("dp")
+    # Passed to the Trainer as `precision`.
+    precision: int = Field(16)
+    # When true, `torch.set_float32_matmul_precision(float32_matmul)` is called before the Trainer is built.
+    override_float32_matmul: bool = Field(True)
+    float32_matmul: str = Field("medium")
+class RealFakeParams(Args):
+    """Training parameters; populated from the command line through `Args.from_args`."""
+    # JSONL file of dataset records read by `RealFakeDataModule`.
+    jsonl_file: Path
+    # When true, the entry point prints model/data summaries instead of training.
+    dry_run: bool = Field(False)
+    # Architecture name handed to `timm.create_model`.
+    model_name: str = Field("convnext_tiny")
+    batch_size: int = Field(256)
+    # NOTE(review): not read by any module shown here — presumably meant for the freeze/unfreeze callback; confirm.
+    freeze_epochs: int = Field(3)
+    # Passed to the Trainer as `max_epochs`.
+    epochs: int = Field(6)
+    # Used both as the AdamW learning rate and as the OneCycleLR `max_lr`.
+    base_lr: float = Field(1e-3)
+    # Whether `timm.create_model` loads pretrained weights.
+    pretrained: bool = Field(True)
+    accelerator: AcceleratorParams = Field(default_factory=AcceleratorParams)
+class RealFakeDataModule(pl.LightningDataModule):
+    def __init__(self, jsonl_records: Path, batch_size: int, num_workers: int = 0):
+        super().__init__()
+        self.jsonl_records = jsonl_records
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.dss = self.dls = None
+    def setup(self, stage=None):
+        records = [json.loads(line) for line in self.jsonl_records.open()]
+        self.dss = get_dss(records)
+        self.dls = get_dls(*self.dss, self.batch_size, self.num_workers)
+    def train_dataloader(self):
+        return self.dls[0]
+    def val_dataloader(self):
+        return self.dls[1]
+class RealFakeClassifier(pl.LightningModule):
+    def __init__(self, params: RealFakeParams):
+        super().__init__()
+        self.params = params
+        self.ce = nn.BCEWithLogitsLoss()
+        self.model = timm.create_model(params.model_name, pretrained=params.pretrained, num_classes=N_CLASSES)
+        self.acc = torchmetrics.Accuracy(task="binary")
+    def train_dataloader(self):
+        return self.dls.train
+    def val_dataloader(self):
+        return self.dls.valid
+    def forward(self, batch):
+        x, y = batch["image"], batch["label"]
+        y = torch.nn.functional.one_hot(y, num_classes=N_CLASSES).float()
+        out = self.model(x)
+        loss = self.ce(out, y)
+        return loss, out, y
+    def training_step(self, batch, batch_idx):
+        loss, _, _ = self.forward(batch)
+        self.log("train_loss", loss, on_epoch=True, on_step=False)
+        return loss
+    def validation_step(self, batch, batch_idx):
+        loss, out, y = self.forward(batch)
+        y_pred = out.sigmoid().argmax(dim=-1)
+        y_true = y.argmax(dim=-1)
+        self.log("val_loss", loss, on_epoch=True, on_step=False)
+        return {"gt": y_true, "yhat": y_pred}
+    def validation_step_end(self, outputs):
+        self.acc.update(outputs["yhat"], outputs["gt"])
+    def validation_epoch_end(self, outputs):
+        self.log("val_acc", self.acc.compute(), on_epoch=True)
+        self.acc.reset()
+    def configure_optimizers(self):
+        adamw = torch.optim.AdamW(self.parameters(), lr=self.params.base_lr)
+        one_cycle = torch.optim.lr_scheduler.OneCycleLR(
+            adamw,
+            max_lr=self.params.base_lr,
+            total_steps=self.trainer.estimated_stepping_batches
+        )
+        return [adamw], [one_cycle]

realfake/train.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from __future__ import annotations
+import os
+import signal
+from pathlib import Path
+import torch
+import pytorch_lightning as pl
+from pytorch_lightning.callbacks import ModelCheckpoint
+from pytorch_lightning.plugins.environments import SLURMEnvironment
+from realfake.callbacks import ConsoleLogger
+from realfake.models import RealFakeParams
+from realfake.utils import get_checkpoints_dir, find_latest_checkpoint
+def get_existing_checkpoint(job_id: str | None = None) -> tuple:
+    if job_id is None:
+        checkpoints_dir = get_checkpoints_dir(timestamp=True)
+    else:
+        checkpoints_dir = get_checkpoints_dir(timestamp=False)/job_id
+    checkpoints_dir.mkdir(parents=True, exist_ok=True)
+    existing_checkpoint = find_latest_checkpoint(checkpoints_dir)
+    return checkpoints_dir, existing_checkpoint
+def prepare_trainer(args: RealFakeParams) -> pl.Trainer:
+    job_id = os.environ.get("SLURM_JOB_ID")
+    checkpoints_dir, existing_checkpoint = get_existing_checkpoint(job_id)
+    if job_id is None:
+        print("SLURM job id is not found, running locally.")
+    if existing_checkpoint is None:
+        print("No existing checkpoint found, starting from scratch.")
+    if args.accelerator.override_float32_matmul:
+        torch.set_float32_matmul_precision(args.accelerator.float32_matmul)
+    with (checkpoints_dir/"params.json").open("w") as fp:
+        fp.write(args.json())
+    trainer_params = dict(
+        accelerator=args.accelerator.name,
+        devices=args.accelerator.devices,
+        precision=args.accelerator.precision,
+        max_epochs=args.epochs,
+        num_nodes=1,
+        num_sanity_val_steps=0,
+        enable_progress_bar=False,
+        callbacks=[
+            ConsoleLogger(),
+            ModelCheckpoint(
+                monitor="val_acc",
+                mode="max",
+                save_last=True,
+                save_top_k=1,
+                dirpath=checkpoints_dir,
+                filename="%s-{epoch:02d}-{val_acc:.4f}" % args.model_name,
+            ),
+        ],
+        resume_from_checkpoint=existing_checkpoint,
+    )
+    if job_id is not None:
+        trainer_params["plugins"] = SLURMEnvironment(requeue_signal=signal.SIGHUP),
+        trainer_params["strategy"] = args.accelerator.strategy
+    return pl.Trainer(**trainer_params)

realfake/train_cluster.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import warnings
+import pytorch_lightning as pl
+from realfake.config import SEED
+from realfake.models import RealFakeClassifier, RealFakeDataModule, RealFakeParams
+from realfake.train import prepare_trainer
+def main() -> None:
+    pl.seed_everything(SEED)
+    args = RealFakeParams.from_args()
+    model = RealFakeClassifier(args)
+    data = RealFakeDataModule(args.jsonl_file, args.batch_size, args.accelerator.devices * 4)
+    trainer = prepare_trainer(args)
+    if args.dry_run:
+        print("Dry run, skipping training.")
+        print("Model summary:")
+        print(model)
+        print("Data summary:")
+        data.setup()
+        print("Train batches:", len(data.dls[0]))
+        print("Valid batches:", len(data.dls[1]))
+    else:
+        trainer.fit(model, datamodule=data)
+if __name__ == "__main__":
+    # Run the entry point with every warning category silenced.
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=Warning)
+        main()

realfake/utils.py ADDED Viewed

	@@ -0,0 +1,121 @@

+from __future__ import annotations
+import argparse
+import datetime
+import json
+import os
+from operator import itemgetter
+from pathlib import Path
+from typing import Callable
+import requests
+import pynvml
+import PIL.Image
+import torch
+from pydantic import BaseSettings, BaseModel
+class Args(BaseSettings):
+    @classmethod
+    def from_args(cls):
+        parser = argparse.ArgumentParser()
+        for field in cls.__fields__.values():
+            if issubclass(field.type_, BaseModel):
+                prefix = field.type_.__name__.lower()
+                for subfield in field.type_.__fields__.values():
+                    short = "".join([x[0] for x in subfield.name.split("_")])
+                    parser.add_argument(f"--{prefix}.{subfield.name}", default=subfield.default, required=subfield.required)
+            else:
+                short = "".join([x[0] for x in field.name.split("_")])
+                parser.add_argument(f"-{short}", f"--{field.name}", default=field.default, required=field.required)
+        args = vars(parser.parse_known_args()[0])
+        to_delete = set()
+        for field in cls.__fields__.values():
+            if issubclass(field.type_, BaseModel):
+                prefix = field.type_.__name__.lower()
+                sub_args = {}
+                for k, v in args.items():
+                    if k.startswith(prefix):
+                        to_delete.add(k)
+                        sub_args[k.replace(f"{prefix}.", "")] = v
+                args[field.name] = sub_args
+        args = {k: v for k, v in args.items() if k not in to_delete}
+        return cls(**args)
+    class Config:
+        env_file = ".env"
+        env_file_encoding = "utf-8"
+        env_prefix = "ARG_"
+def inject_args(func: Callable) -> Callable:
+    """Decorates a function to inject the arguments."""
+    injected = None
+    for type_ in func.__annotations__.values():
+        if issubclass(type_, Args):
+            injected = type_.from_args()
+            break
+    if injected is None:
+        raise ValueError(f"Function {func.__name__} is not annotated with an Args subclass.")
+    def wrapper(*args, **kwargs):
+        return func(injected, *args, **kwargs)
+    return wrapper
+def get_free_gpu() -> int:
+    pynvml.nvmlInit()
+    total = torch.cuda.device_count()
+    gpus = []
+    for i in range(total):
+        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+        gpus.append((i, info.free))
+    gpus = sorted(gpus, key=itemgetter(1), reverse=True)
+    return gpus[0][0]
+def get_user_name() -> str:
+    return Path(os.environ["HOME"]).stem
+def get_storage_dir() -> Path:
+    return Path(f"/fsx/{get_user_name()}")
+def get_checkpoints_dir(*, timestamp: bool) -> Path:
+    base_dir = get_storage_dir()/"checkpoints"
+    return Path(f"{base_dir}/{now()}") if timestamp else base_dir
+def now() -> str:
+    return datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")
+def read_jsonl(path: Path) -> list:
+    return [json.loads(x) for x in Path(path).read_text().split("\n") if x]
+def write_jsonl(path: Path, data: list):
+    with Path(path).open("w") as f:
+        for x in data:
+            f.write(json.dumps(x) + "\n")
+def get_image(url: str, filename: Path | None = None):
+    if filename is None: filename = Path(f"{url.split('/')[-1]}.jpg")
+    filename = Path(filename)
+    if filename.exists(): return filename
+    PIL.Image.open(requests.get(url, stream=True).raw).save(filename)
+    return filename
+def find_latest_checkpoint(dirname: Path) -> Path:
+    checkpoints = list(dirname.glob("*.ckpt"))
+    if not checkpoints:
+        return None
+    latest = max(checkpoints, key=lambda path: path.stat().st_mtime)
+    return latest

submit.sh ADDED Viewed

	@@ -0,0 +1,24 @@

+#!/bin/bash -l
+# SLURM SUBMIT SCRIPT
+#SBATCH --partition=g40
+#SBATCH --nodes=1
+#SBATCH --gpus=8
+#SBATCH --cpus-per-gpu=6
+#SBATCH --job-name=realfake
+#SBATCH --comment=laion
+#SBATCH --signal=SIGUSR1@90
+source "${HOME}/venv/bin/activate"
+export NCCL_DEBUG=INFO
+export PYTHONFAULTHANDLER=1
+export PYTHONPATH="${HOME}/realfake"
+echo "Working directory: `pwd`"
+srun python3 realfake/train_cluster.py \
+    -jf "${HOME}/realfake/metadata/prepared.2000k.jsonl" \
+    -mn convnext_large -e 5 -bs 128 \
+    --acceleratorparams.devices=8 \
+    --acceleratorparams.strategy=ddp_find_unused_parameters_false