devforfu committed on
Commit
8da91c9
1 Parent(s): c1f3687
Files changed (2) hide show
  1. nbs/movie.ipynb +172 -0
  2. nbs/movie_more.ipynb +204 -0
nbs/movie.ipynb ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "d6299e4c-f1ba-4be4-ac89-63c05287387c",
7
+ "metadata": {
8
+ "tags": []
9
+ },
10
+ "outputs": [
11
+ {
12
+ "name": "stdout",
13
+ "output_type": "stream",
14
+ "text": [
15
+ "/admin/home-devforfu/realfake\n"
16
+ ]
17
+ }
18
+ ],
19
+ "source": [
20
+ "%cd .."
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 20,
26
+ "id": "8729a815-81b7-4667-bb97-c85456ad86e8",
27
+ "metadata": {
28
+ "tags": []
29
+ },
30
+ "outputs": [],
31
+ "source": [
32
+ "from pathlib import Path\n",
33
+ "import numpy as np\n",
34
+ "import pandas as pd\n",
35
+ "from sklearn.model_selection import train_test_split\n",
36
+ "from realfake.utils import list_files, write_jsonl"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 3,
42
+ "id": "82fa1fee-3ea0-415e-8a6a-8622bc55fabd",
43
+ "metadata": {
44
+ "tags": []
45
+ },
46
+ "outputs": [],
47
+ "source": [
48
+ "root = Path(\"/fsx/home-devforfu/data\")\n",
49
+ "laion = list_files(root/\"laionimages\", [\"jpg\"])\n",
50
+ "movie = list_files(root/\"shotcafe\", [\"jpg\"])"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 4,
56
+ "id": "58d79304-bdf0-445d-b5dd-46bd2a7bf1ec",
57
+ "metadata": {
58
+ "tags": []
59
+ },
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "text/plain": [
64
+ "(5133, 7526)"
65
+ ]
66
+ },
67
+ "execution_count": 4,
68
+ "metadata": {},
69
+ "output_type": "execute_result"
70
+ }
71
+ ],
72
+ "source": [
73
+ "len(laion), len(movie)"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 12,
79
+ "id": "f78ced08-3a5a-48f8-a635-6d93e82856c1",
80
+ "metadata": {
81
+ "tags": []
82
+ },
83
+ "outputs": [],
84
+ "source": [
85
+ "n_test = 0.1"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 19,
91
+ "id": "7a89dac3-cc8f-46cf-b60a-9118121ff69a",
92
+ "metadata": {
93
+ "tags": []
94
+ },
95
+ "outputs": [],
96
+ "source": [
97
+ "tst = set(train_test_split(np.arange(len(laion)), test_size=n_test)[1])\n",
98
+ "metadata = [{\"path\": str(fn), \"label\": \"real\", \"valid\": i in tst} for i, fn in enumerate(laion)]\n",
99
+ "tst = set(train_test_split(np.arange(len(movie)), test_size=n_test)[1])\n",
100
+ "metadata += [{\"path\": str(fn), \"label\": \"fake\", \"valid\": i in tst} for i, fn in enumerate(movie)]"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 25,
106
+ "id": "b0da02f5-eec6-4114-81fa-901aef933d72",
107
+ "metadata": {
108
+ "tags": []
109
+ },
110
+ "outputs": [
111
+ {
112
+ "name": "stdout",
113
+ "output_type": "stream",
114
+ "text": [
115
+ "False 11392\n",
116
+ "True 1267\n",
117
+ "Name: valid, dtype: int64\n",
118
+ "fake 7526\n",
119
+ "real 5133\n",
120
+ "Name: label, dtype: int64\n"
121
+ ]
122
+ }
123
+ ],
124
+ "source": [
125
+ "df = pd.DataFrame(metadata)\n",
126
+ "print(df.valid.value_counts())\n",
127
+ "print(df.label.value_counts())"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 26,
133
+ "id": "bdad7b5a-17de-424b-8bfb-d13fe197552b",
134
+ "metadata": {
135
+ "tags": []
136
+ },
137
+ "outputs": [],
138
+ "source": [
139
+ "write_jsonl(\"metadata/movies.jsonl\", metadata)"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": null,
145
+ "id": "428dda09-77b8-4f6c-b14e-059e9d281f9a",
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": []
149
+ }
150
+ ],
151
+ "metadata": {
152
+ "kernelspec": {
153
+ "display_name": "Python 3 (ipykernel)",
154
+ "language": "python",
155
+ "name": "python3"
156
+ },
157
+ "language_info": {
158
+ "codemirror_mode": {
159
+ "name": "ipython",
160
+ "version": 3
161
+ },
162
+ "file_extension": ".py",
163
+ "mimetype": "text/x-python",
164
+ "name": "python",
165
+ "nbconvert_exporter": "python",
166
+ "pygments_lexer": "ipython3",
167
+ "version": "3.8.10"
168
+ }
169
+ },
170
+ "nbformat": 4,
171
+ "nbformat_minor": 5
172
+ }
nbs/movie_more.ipynb ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "df45e4ec-e732-4cf4-8017-ce3753f4cd48",
7
+ "metadata": {
8
+ "tags": []
9
+ },
10
+ "outputs": [],
11
+ "source": [
12
+ "%cd .."
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "id": "67eda6f2-e2e5-495a-8795-25365d46c081",
19
+ "metadata": {
20
+ "tags": []
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "import re\n",
25
+ "from pathlib import Path\n",
26
+ "import numpy as np\n",
27
+ "import pandas as pd\n",
28
+ "from sklearn.model_selection import train_test_split\n",
29
+ "from realfake.utils import list_files, write_jsonl"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "c6e57858-c5f1-4ef9-a69b-31e402215564",
36
+ "metadata": {
37
+ "tags": []
38
+ },
39
+ "outputs": [],
40
+ "source": [
41
+ "np.random.seed(1)"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "id": "5f8e03d2-039d-4aaf-b8bd-173d04b8d888",
48
+ "metadata": {
49
+ "tags": []
50
+ },
51
+ "outputs": [],
52
+ "source": [
53
+ "root = Path(\"/fsx/home-devforfu/data\")\n",
54
+ "laion, movie1, movie2 = [list_files(root/subdir, [\"jpg\"]) for subdir in (\"laionimages\", \"shotcafe\", \"pack1\")]"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "id": "1766851a-e1f6-46af-98a1-700aa53dd240",
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "len(movie2)"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": null,
70
+ "id": "dcb3b8ab-6012-4e78-958b-aef1bc743ef9",
71
+ "metadata": {
72
+ "tags": []
73
+ },
74
+ "outputs": [],
75
+ "source": [
76
+ "n_test = 0.1"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": null,
82
+ "id": "419062b0-e594-4644-87f2-180ba864587e",
83
+ "metadata": {
84
+ "tags": []
85
+ },
86
+ "outputs": [],
87
+ "source": [
88
+ "tst = set(train_test_split(np.arange(len(laion)), test_size=n_test)[1])\n",
89
+ "metadata = [{\"path\": str(fn), \"label\": \"real\", \"valid\": i in tst} for i, fn in enumerate(laion)]\n",
90
+ "tst = set(train_test_split(np.arange(len(movie1)), test_size=n_test)[1])\n",
91
+ "metadata += [{\"path\": str(fn), \"label\": \"fake\", \"valid\": i in tst} for i, fn in enumerate(movie1)]"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": null,
97
+ "id": "8f6ed004-8932-4ce9-b4cb-32d0c5c47da7",
98
+ "metadata": {
99
+ "tags": []
100
+ },
101
+ "outputs": [],
102
+ "source": [
103
+ "from collections import defaultdict\n",
104
+ "movie_to_frame = defaultdict(list)\n",
105
+ "for fn in movie2:\n",
106
+ " movie_name = re.search(\"((?:[a-zA-Z]+|[0-9]+))\", fn.stem).group(1)\n",
107
+ " movie_to_frame[movie_name].append(fn)"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "id": "bb06edea-419c-4e3f-b9dd-f4869e05d3a9",
114
+ "metadata": {
115
+ "tags": []
116
+ },
117
+ "outputs": [],
118
+ "source": [
119
+ "trn_keys, tst_keys = train_test_split(list(set(movie_to_frame)), test_size=n_test)"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "id": "6ac623e7-c9ae-4467-9fc7-f363adae6e57",
126
+ "metadata": {
127
+ "tags": []
128
+ },
129
+ "outputs": [],
130
+ "source": [
131
+ "metadata += [{\"path\": str(fn), \"label\": \"fake\", \"valid\": False} for key in trn_keys for fn in movie_to_frame[key]]\n",
132
+ "metadata += [{\"path\": str(fn), \"label\": \"fake\", \"valid\": True} for key in tst_keys for fn in movie_to_frame[key]]"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": null,
138
+ "id": "a70b6b30-fdbb-49cb-b10f-34547c4072bb",
139
+ "metadata": {
140
+ "tags": []
141
+ },
142
+ "outputs": [],
143
+ "source": [
144
+ "df = pd.DataFrame(metadata)\n",
145
+ "print(df.valid.value_counts())\n",
146
+ "print(df.label.value_counts())"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": null,
152
+ "id": "737a3a2f-428d-490f-a8a9-def76f094ce0",
153
+ "metadata": {
154
+ "tags": []
155
+ },
156
+ "outputs": [],
157
+ "source": [
158
+ "pos_weight = (1 - df.label.value_counts(normalize=True)).tolist()\n",
159
+ "pos_weight"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "id": "72e03454-cec8-4b58-ba3a-f1fe7d59ee8e",
166
+ "metadata": {
167
+ "tags": []
168
+ },
169
+ "outputs": [],
170
+ "source": [
171
+ "write_jsonl(\"metadata/movies_plus.jsonl\", metadata)"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "id": "41c061b1-7f8d-43d5-9b52-88c6144d0bda",
178
+ "metadata": {},
179
+ "outputs": [],
180
+ "source": []
181
+ }
182
+ ],
183
+ "metadata": {
184
+ "kernelspec": {
185
+ "display_name": "Python 3 (ipykernel)",
186
+ "language": "python",
187
+ "name": "python3"
188
+ },
189
+ "language_info": {
190
+ "codemirror_mode": {
191
+ "name": "ipython",
192
+ "version": 3
193
+ },
194
+ "file_extension": ".py",
195
+ "mimetype": "text/x-python",
196
+ "name": "python",
197
+ "nbconvert_exporter": "python",
198
+ "pygments_lexer": "ipython3",
199
+ "version": "3.8.10"
200
+ }
201
+ },
202
+ "nbformat": 4,
203
+ "nbformat_minor": 5
204
+ }