Samuel Mueller committed on
Commit e487255
0 Parent(s):
Files changed (47)
  1. .gitattributes +33 -0
  2. .gitmodules +0 -0
  3. README.md +12 -0
  4. TabPFN/PrepareDatasets.ipynb +373 -0
  5. TabPFN/README.md +23 -0
  6. TabPFN/SyntheticGPAblation.ipynb +392 -0
  7. TabPFN/TabPFNPredictionOnly.ipynb +253 -0
  8. TabPFN/TabularEvaluationVisualization.ipynb +0 -0
  9. TabPFN/TrainingTuningAndPrediction.ipynb +0 -0
  10. TabPFN/datasets/__init__.py +149 -0
  11. TabPFN/datasets/utils.py +8 -0
  12. TabPFN/decoders.py +30 -0
  13. TabPFN/differentiable_pfn_evaluation.py +345 -0
  14. TabPFN/encoders.py +225 -0
  15. TabPFN/initializers.py +9 -0
  16. TabPFN/layer.py +125 -0
  17. TabPFN/losses.py +41 -0
  18. TabPFN/model_builder.py +273 -0
  19. TabPFN/models_diff/gp_ablation_model.cpkt +3 -0
  20. TabPFN/models_diff/prior_diff_real_checkpoint_n_8x_lr0.0003_epoch_49.cpkt +3 -0
  21. TabPFN/notebook_utils.py +32 -0
  22. TabPFN/positional_encodings.py +70 -0
  23. TabPFN/prior_tuning_result.pkl +3 -0
  24. TabPFN/priors/__init__.py +4 -0
  25. TabPFN/priors/differentiable_prior.py +293 -0
  26. TabPFN/priors/fast_gp.py +144 -0
  27. TabPFN/priors/flexible_categorical.py +240 -0
  28. TabPFN/priors/mlp.py +173 -0
  29. TabPFN/priors/prior.py +12 -0
  30. TabPFN/priors/prior_bag.py +32 -0
  31. TabPFN/priors/utils.py +163 -0
  32. TabPFN/requirements.txt +15 -0
  33. TabPFN/scripts/baseline_prediction_interface.py +38 -0
  34. TabPFN/scripts/differentiable_pfn_evaluation.py +391 -0
  35. TabPFN/scripts/model_configs.py +210 -0
  36. TabPFN/scripts/tabular_baselines.py +421 -0
  37. TabPFN/scripts/tabular_evaluation.py +284 -0
  38. TabPFN/scripts/tabular_metrics.py +181 -0
  39. TabPFN/scripts/transformer_prediction_interface.py +357 -0
  40. TabPFN/tabular_evaluation.py +283 -0
  41. TabPFN/train.py +386 -0
  42. TabPFN/transformer.py +226 -0
  43. TabPFN/utils.py +236 -0
  44. app.py +96 -0
  45. balance-scale.arff +694 -0
  46. iris.csv +151 -0
  47. requirements.txt +16 -0
.gitattributes ADDED
@@ -0,0 +1,33 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.cpkt filter=lfs diff=lfs merge=lfs -text
.gitmodules ADDED
File without changes
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: TabPFN
+ emoji: 🐨
+ colorFrom: gray
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.1.1
+ app_file: app.py
+ pinned: true
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
TabPFN/PrepareDatasets.ipynb ADDED
@@ -0,0 +1,373 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import numpy as np\n",
10
+ "\n",
11
+ "import openml\n",
12
+ "import pandas as pd"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 2,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "from tqdm import tqdm\n",
22
+ "\n",
23
+ "from datasets import load_openml_list, test_dids_classification, valid_large_classification, open_cc_dids, open_cc_valid_dids\n"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 6,
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "name": "stdout",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "The autoreload extension is already loaded. To reload it, use:\n",
36
+ " %reload_ext autoreload\n"
37
+ ]
38
+ }
39
+ ],
40
+ "source": [
41
+ "%load_ext autoreload\n",
42
+ "\n",
43
+ "%autoreload 2"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "metadata": {
49
+ "tags": []
50
+ },
51
+ "source": [
52
+ "### Prepare test datasets"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 7,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "renamer = {'name': 'Name', 'NumberOfFeatures': '# Features', 'NumberOfSymbolicFeatures': '# Categorical Features', 'NumberOfInstances': '# Instances', 'NumberOfMissingValues': '# NaNs', 'NumberOfClasses': '# Classes', 'MinorityClassSize': 'Minority Class Size'}\n"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 8,
67
+ "metadata": {},
68
+ "outputs": [
69
+ {
70
+ "data": {
71
+ "text/plain": [
72
+ "OrderedDict([(99,\n",
73
+ " {'id': 99,\n",
74
+ " 'alias': 'OpenML-CC18',\n",
75
+ " 'main_entity_type': 'task',\n",
76
+ " 'name': 'OpenML-CC18 Curated Classification benchmark',\n",
77
+ " 'status': 'active',\n",
78
+ " 'creation_date': '2019-02-21 18:47:13',\n",
79
+ " 'creator': 1}),\n",
80
+ " (225,\n",
81
+ " {'id': 225,\n",
82
+ " 'alias': 'OpenML-friendly',\n",
83
+ " 'main_entity_type': 'task',\n",
84
+ " 'name': 'OpenML100-friendly',\n",
85
+ " 'status': 'active',\n",
86
+ " 'creation_date': '2019-09-16 19:41:46',\n",
87
+ " 'creator': 1})])"
88
+ ]
89
+ },
90
+ "execution_count": 8,
91
+ "metadata": {},
92
+ "output_type": "execute_result"
93
+ }
94
+ ],
95
+ "source": [
96
+ "openml.study.list_suites()"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 9,
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "suite = openml.study.get_suite(suite_id=99)\n",
106
+ "tasks = openml.tasks.list_tasks(output_format=\"dataframe\")"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 10,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "# Using ``@`` in `pd.DataFrame.query <\n",
116
+ "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_\n",
117
+ "# accesses variables outside of the current dataframe.\n",
118
+ "tasks = tasks.query(\"tid in @suite.tasks\")"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": 11,
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "tids = list(tasks[np.logical_and(np.logical_and((tasks.NumberOfInstances <= 2000), (tasks.NumberOfFeatures <= 100))\n",
128
+ " , (tasks.NumberOfClasses <= 10))].tid)"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 12,
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "data": {
138
+ "text/plain": [
139
+ "30"
140
+ ]
141
+ },
142
+ "execution_count": 12,
143
+ "metadata": {},
144
+ "output_type": "execute_result"
145
+ }
146
+ ],
147
+ "source": [
148
+ "len(tids)"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 13,
154
+ "metadata": {},
155
+ "outputs": [],
156
+ "source": [
157
+ "tids = list(tasks[tasks.NumberOfInstances <= 2000].tid)"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 14,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "open_cc_dids = [openml.tasks.get_task(task_id).get_dataset().id for task_id in tids]"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": null,
172
+ "outputs": [],
173
+ "source": [
174
+ "open_ml_datasets, open_ml_datasets_df = load_openml_list(test_dids_classification, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 100000, num_feats=100, return_capped=True)\n"
175
+ ],
176
+ "metadata": {
177
+ "collapsed": false,
178
+ "pycharm": {
179
+ "name": "#%%\n"
180
+ }
181
+ }
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": 16,
186
+ "metadata": {},
187
+ "outputs": [],
188
+ "source": [
189
+ "open_ml_datasets_df = open_ml_datasets_df[open_ml_datasets_df.NumberOfInstances > 10000]"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 17,
195
+ "metadata": {},
196
+ "outputs": [
197
+ {
198
+ "name": "stdout",
199
+ "output_type": "stream",
200
+ "text": [
201
+ "\\begin{tabular}{lrrrrrrr}\n",
202
+ "\\toprule\n",
203
+ " Name & \\# Features & \\# Categorical Features & \\# Instances & \\# Classes & \\# NaNs & Minority Class Size & id \\\\\n",
204
+ "\\midrule\n",
205
+ " KDDCup09\\_appetency & 231 & 39 & 50000 & 2 & 8024152 & 890 & 1111 \\\\\n",
206
+ " airlines & 8 & 5 & 539383 & 2 & 0 & 240264 & 1169 \\\\\n",
207
+ " bank-marketing & 17 & 10 & 45211 & 2 & 0 & 5289 & 1461 \\\\\n",
208
+ " nomao & 119 & 30 & 34465 & 2 & 0 & 9844 & 1486 \\\\\n",
209
+ " adult & 15 & 9 & 48842 & 2 & 6465 & 11687 & 1590 \\\\\n",
210
+ " covertype & 55 & 45 & 581012 & 7 & 0 & 2747 & 1596 \\\\\n",
211
+ " numerai28.6 & 22 & 1 & 96320 & 2 & 0 & 47662 & 23517 \\\\\n",
212
+ " connect-4 & 43 & 43 & 67557 & 3 & 0 & 6449 & 40668 \\\\\n",
213
+ "jungle\\_chess\\_2pcs\\_raw\\_endgame\\_complete & 7 & 1 & 44819 & 3 & 0 & 4335 & 41027 \\\\\n",
214
+ " APSFailure & 171 & 1 & 76000 & 2 & 1078695 & 1375 & 41138 \\\\\n",
215
+ " albert & 79 & 53 & 425240 & 2 & 2734000 & 212620 & 41147 \\\\\n",
216
+ " MiniBooNE & 51 & 1 & 130064 & 2 & 0 & 36499 & 41150 \\\\\n",
217
+ " guillermo & 4297 & 1 & 20000 & 2 & 0 & 8003 & 41159 \\\\\n",
218
+ " riccardo & 4297 & 1 & 20000 & 2 & 0 & 5000 & 41161 \\\\\n",
219
+ " volkert & 181 & 1 & 58310 & 10 & 0 & 1361 & 41166 \\\\\n",
220
+ " dionis & 61 & 1 & 416188 & 355 & 0 & 878 & 41167 \\\\\n",
221
+ " jannis & 55 & 1 & 83733 & 4 & 0 & 1687 & 41168 \\\\\n",
222
+ " helena & 28 & 1 & 65196 & 100 & 0 & 111 & 41169 \\\\\n",
223
+ "\\bottomrule\n",
224
+ "\\end{tabular}\n",
225
+ "\n"
226
+ ]
227
+ }
228
+ ],
229
+ "source": [
230
+ "print_table = open_ml_datasets_df\n",
231
+ "print_table = print_table[['name', 'NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].copy()\n",
232
+ "print_table['id'] = print_table.index\n",
233
+ "print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']] = print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].astype(int)\n",
234
+ "print_table = print_table.rename(columns=renamer)\n",
235
+ "print(print_table.to_latex(index=False))"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "markdown",
240
+ "metadata": {
241
+ "tags": []
242
+ },
243
+ "source": [
244
+ "### Prepare Validation datasets"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": null,
250
+ "outputs": [],
251
+ "source": [
252
+ "open_cc_datasets, open_cc_datasets_df = load_openml_list(open_cc_dids, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 2000, num_feats=100, return_capped=True)\n",
253
+ "\n",
254
+ "def extend_datasets(datasets, filtering = False):\n",
255
+ " extended_datasets = {}\n",
256
+ " i = 0\n",
257
+ " for d in tqdm(datasets):\n",
258
+ " if ((not 'NumberOfFeatures' in datasets[d])\n",
259
+ " or (not 'NumberOfClasses' in datasets[d])\n",
260
+ " or (not 'NumberOfInstances' in datasets[d])\n",
261
+ " # or datasets[d]['NumberOfFeatures'] >= num_feats\n",
262
+ " or datasets[d]['NumberOfClasses'] <= 0):\n",
263
+ " print(datasets[d])\n",
264
+ " continue\n",
265
+ " ds = openml.datasets.get_dataset(d, download_data=False)\n",
266
+ " if filtering and (datasets[d]['NumberOfInstances'] < 150\n",
267
+ " or datasets[d]['NumberOfInstances'] > 2000\n",
268
+ " or datasets[d]['NumberOfFeatures'] > 100\n",
269
+ " or datasets[d]['NumberOfClasses'] > 10):\n",
270
+ " continue\n",
271
+ " extended_datasets[d] = datasets[d]\n",
272
+ " extended_datasets[d].update(ds.qualities)\n",
273
+ " \n",
274
+ " return extended_datasets\n",
275
+ "\n",
276
+ "# All datasets\n",
277
+ "openml_list = openml.datasets.list_datasets()\n",
278
+ "openml_list = pd.DataFrame.from_dict(openml_list, orient=\"index\")\n",
279
+ "\n",
280
+ "# Select only classification\n",
281
+ "openml_list = openml_list[~openml_list['MajorityClassSize'].isna()]\n",
282
+ "\n",
283
+ "# Remove duplicated datasets\n",
284
+ "duplicated = openml_list.duplicated(subset=['MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize',\n",
285
+ " 'NumberOfClasses', 'NumberOfFeatures', 'NumberOfInstances',\n",
286
+ " 'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',\n",
287
+ " 'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures'], keep='first')\n",
288
+ "openml_list = openml_list[~duplicated]\n",
289
+ "\n",
290
+ "duplicated = openml_list.duplicated(subset=['name'], keep='first')\n",
291
+ "openml_list = openml_list[~duplicated]\n",
292
+ "\n",
293
+ "# Filter out datasets that don't have meta information or Don't fulfill other criteria\n",
294
+ "openml_list = openml_list.to_dict(orient='index')\n",
295
+ "openml_list = pd.DataFrame.from_dict(extend_datasets(openml_list, filtering=True), orient=\"index\")\n",
296
+ "\n",
297
+ "# Filter out datasets in Open CC\n",
298
+ "openml_list = openml_list[~openml_list.name.apply(lambda x: x in test_datasets_multiclass_df.name.values)]\n",
299
+ "openml_list['CFI'] = openml_list.apply(lambda x: str(x.NumberOfClasses) + '_' + str(x.NumberOfFeatures) + '_' + str(x.NumberOfInstances), axis = 1)\n",
300
+ "test_datasets_multiclass_df['CFI'] = test_datasets_multiclass_df.apply(lambda x: str(x.NumberOfClasses) + '_' + str(x.NumberOfFeatures) + '_' + str(x.NumberOfInstances), axis = 1)\n",
301
+ "openml_list = openml_list[~openml_list.CFI.apply(lambda x: x in test_datasets_multiclass_df.CFI.values)]\n",
302
+ "\n",
303
+ "# Remove time series and artificial data\n",
304
+ "openml_list = openml_list[~openml_list.name.apply(lambda x: 'autoUniv' in x)]\n",
305
+ "openml_list = openml_list[~openml_list.name.apply(lambda x: 'fri_' in x)]\n",
306
+ "openml_list = openml_list[~openml_list.name.apply(lambda x: 'FOREX' in x)]\n",
307
+ "\n",
308
+ "# Remove datasets that overlapped with Open CC closely by name\n",
309
+ "openml_list = openml_list[~openml_list.name.apply(lambda x: 'ilpd' in x)]\n",
310
+ "openml_list = openml_list[~openml_list.name.apply(lambda x: 'car' in x)]\n",
311
+ "openml_list = openml_list[~openml_list.name.apply(lambda x: 'pc1' in x)]\n",
312
+ "\n",
313
+ "# Remove datasets that didn't load\n",
314
+ "openml_list = openml_list[~openml_list.did.apply(lambda x: x in {1065, 40589, 41496, 770, 43097, 43148, 43255, 43595, 43786, 41701})]\n",
315
+ "\n",
316
+ "# Remove class skew\n",
317
+ "openml_list = openml_list[(openml_list.MinorityClassSize / openml_list.MajorityClassSize) > 0.05]\n",
318
+ "openml_list = openml_list[openml_list.AutoCorrelation != 1]\n",
319
+ "\n",
320
+ "# Remove too easy\n",
321
+ "openml_list = openml_list[openml_list.CfsSubsetEval_DecisionStumpAUC != 1]"
322
+ ],
323
+ "metadata": {
324
+ "collapsed": false,
325
+ "pycharm": {
326
+ "name": "#%%\n"
327
+ }
328
+ }
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": null,
333
+ "metadata": {},
334
+ "outputs": [],
335
+ "source": [
336
+ "print_table = openml_list\n",
337
+ "print_table = print_table[['name', 'NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].copy()\n",
338
+ "print_table['id'] = print_table.index\n",
339
+ "print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']] = print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].astype(int)\n",
340
+ "print_table = print_table.rename(columns=renamer)\n",
341
+ "print(print_table.to_latex(index=False))"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": null,
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": []
350
+ }
351
+ ],
352
+ "metadata": {
353
+ "kernelspec": {
354
+ "display_name": "Python 3 (ipykernel)",
355
+ "language": "python",
356
+ "name": "python3"
357
+ },
358
+ "language_info": {
359
+ "codemirror_mode": {
360
+ "name": "ipython",
361
+ "version": 3
362
+ },
363
+ "file_extension": ".py",
364
+ "mimetype": "text/x-python",
365
+ "name": "python",
366
+ "nbconvert_exporter": "python",
367
+ "pygments_lexer": "ipython3",
368
+ "version": "3.7.13"
369
+ }
370
+ },
371
+ "nbformat": 4,
372
+ "nbformat_minor": 4
373
+ }
TabPFN/README.md ADDED
@@ -0,0 +1,23 @@
+ # TabPFN
+
+ ## Installation
+ ```
+ git clone git@github.com:automl/TabPFN.git
+ cd TabPFN
+ conda create -n TabPFN python=3.7
+ conda activate TabPFN
+ pip install -r requirements.txt
+ ```
+
+ To run the autogluon baseline, please create a separate environment and install autogluon==0.4.0; it cannot be installed in the same environment as our other baselines.
+
+ ## Usage
+ TrainingTuningAndPrediction: Train a TabPFN, prior-tune, and predict using a pretrained model.
+
+ TabularEvaluationVisualization: Run baselines and load baseline and TabPFN results for comparison and plotting.
+
+ PrepareDatasets: Notebook used to inspect datasets (not needed to run baselines / TabPFN).
+
+ SyntheticGPAblation: Ablation experiments for Gaussian process fitting with differentiable hyperparameters.
+
+
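For context on the Usage section above, here is a minimal prediction sketch in the style of TabPFNPredictionOnly.ipynb (added later in this commit). It is not part of the commit itself: it assumes the repository root is on the Python path and the pretrained checkpoint under models_diff/ is present, and it uses the iris data purely as an illustrative stand-in for the notebook's OpenML datasets.

```python
# Minimal sketch (illustrative, not part of the committed files): predict with the
# scikit-learn-style wrapper from scripts/transformer_prediction_interface.py.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from scripts.transformer_prediction_interface import TabPFNClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

classifier = TabPFNClassifier(device='cpu')
classifier.fit(X_train, y_train)          # fit only stores the training data
proba = classifier.predict_proba(X_test)  # the transformer forward pass runs here
print(proba.shape)
```

As the TabPFNPredictionOnly notebook notes, `fit` performs no computation; all work happens at inference time in `predict` / `predict_proba`, and the pretrained models cover at most 100 features and 10 classes.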
TabPFN/SyntheticGPAblation.ipynb ADDED
@@ -0,0 +1,392 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "\n",
11
+ "%autoreload 2"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "import os\n",
21
+ "import time\n",
22
+ "\n",
23
+ "import torch\n",
24
+ "\n",
25
+ "import numpy as np\n",
26
+ "\n",
27
+ "import matplotlib.pyplot as plt\n",
28
+ "\n",
29
+ "from model_builder import get_model, get_default_spec, save_model, load_model\n",
30
+ "\n",
31
+ "from scripts.model_configs import *"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "metadata": {
37
+ "tags": []
38
+ },
39
+ "source": [
40
+ "# Setting params"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 6,
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "device = 'cuda'\n",
50
+ "base_path = os.path.join('.')"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 7,
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "def train_function(config_sample, i, add_name=''):\n",
60
+ " start_time = time.time()\n",
61
+ " N_epochs_to_save = 50\n",
62
+ " \n",
63
+ " def save_callback(model, epoch):\n",
64
+ " if not hasattr(model, 'last_saved_epoch'):\n",
65
+ " model.last_saved_epoch = 0\n",
66
+ " if ((time.time() - start_time) / (maximum_runtime * 60 / N_epochs_to_save)) > model.last_saved_epoch:\n",
67
+ " print('Saving model..')\n",
68
+ " config_sample['epoch_in_training'] = epoch\n",
69
+ " save_model(model, base_path, f'models_diff/prior_diff_real_checkpoint{add_name}_n_{i}_epoch_{model.last_saved_epoch}.cpkt',\n",
70
+ " config_sample)\n",
71
+ " model.last_saved_epoch = model.last_saved_epoch + 1 # TODO: Rename to checkpoint\n",
72
+ " \n",
73
+ " model = get_model(config_sample\n",
74
+ " , device\n",
75
+ " , should_train=True\n",
76
+ " , verbose=1\n",
77
+ " , epoch_callback = save_callback)\n",
78
+ " \n",
79
+ " return"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "markdown",
84
+ "metadata": {
85
+ "heading_collapsed": true,
86
+ "tags": []
87
+ },
88
+ "source": [
89
+ "# Check synthetic data fitting"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "metadata": {
95
+ "tags": []
96
+ },
97
+ "source": [
98
+ "#### Workflow functions"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 8,
104
+ "metadata": {
105
+ "hidden": true,
106
+ "tags": []
107
+ },
108
+ "outputs": [],
109
+ "source": [
110
+ "def generate_test_data(test_gp_params):\n",
111
+ " # Generate test data\n",
112
+ " config = {**test_gp_params}\n",
113
+ "\n",
114
+ " config['verbose'] = False\n",
115
+ " config['differentiable'] = False\n",
116
+ " #config['bptt'] = config['bptt_in_training']\n",
117
+ "\n",
118
+ " model_test_data = get_model(config, device, should_train=False, verbose=True)\n",
119
+ " (hp_embedding, data, targets_), targets = next(iter(model_test_data[3]))\n",
120
+ " (hp_embedding, data, targets_), targets = (hp_embedding, data.to(device), targets_.to(device)), targets.to(device)\n",
121
+ " \n",
122
+ " return (hp_embedding, data, targets_), targets\n",
123
+ "\n",
124
+ "def evaluate_hp_range(model, hparam_true, vary_hparam_ind, data, targets, eval_pos, plot_step_size):\n",
125
+ " losses, hparams = [], []\n",
126
+ " for l in np.arange(-1.74, 1.74, plot_step_size):\n",
127
+ " hparam = [*hparam_true]\n",
128
+ " hparam[vary_hparam_ind] = l\n",
129
+ " hp_embedding_used = torch.tensor(hparam).to(device).float()\n",
130
+ " with torch.inference_mode():\n",
131
+ " outputs = torch.sigmoid(model[2]((hp_embedding_used.repeat(data.shape[1], 1), data, targets.float()), single_eval_pos=eval_pos)).squeeze(-1)\n",
132
+ " \n",
133
+ " loss = torch.nn.BCELoss()(outputs.flatten(), targets[eval_pos:].flatten()).detach().cpu()\n",
134
+ " losses += [loss]\n",
135
+ " hparam_real = [diff_hparams_f[i][1](hp) for i, hp in enumerate(hparam)]\n",
136
+ " hparams += [hparam_real]\n",
137
+ " \n",
138
+ " print(loss, hparam_real, hparam, outputs.shape)\n",
139
+ " return np.array(losses), np.array(hparams)"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 9,
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "def differentiable_hparam_tuning_workflow(config_sample, hparam_label, batch_size=4, N_grad_steps=50, plot_step_size=0.1):\n",
149
+ " test_gp_params = {\n",
150
+ " \"lengthscale\": 1.0,\n",
151
+ " #\"lengthscale_mean\": true_lengthscale,\n",
152
+ " #\"lengthscale_std\": 0.5,\n",
153
+ " \"noise\": 0.2,\n",
154
+ " \"outputscale\": 1.0,\n",
155
+ " 'batch_size': batch_size\n",
156
+ " }\n",
157
+ " config_sample.update(test_gp_params)\n",
158
+ " (hp_embedding, data, targets_), targets = generate_test_data(config_sample)\n",
159
+ " hparam_true = [diff_hparams_f[i][0](test_gp_params[hp]) for i, hp in enumerate(diff_hparams_keys)]\n",
160
+ " #hparam_true = [test_gp_params[hp] for i, hp in enumerate(diff_hparams_keys)]\n",
161
+ "\n",
162
+ " for vary_hparam_ind, vary_hparam_name in hparam_label:\n",
163
+ " print(vary_hparam_name)\n",
164
+ "\n",
165
+ " losses, hparams = evaluate_hp_range(model, hparam_true, vary_hparam_ind, data, targets, eval_pos, plot_step_size=plot_step_size)\n",
166
+ "\n",
167
+ " # TODO: Make only one parameter diffable\n",
168
+ " hparam = torch.tensor([*hparam_true]).to(device).float()\n",
169
+ " hparam[vary_hparam_ind] = hparam[vary_hparam_ind] + 0.1 #random.random() * 2 - 1\n",
170
+ " hparam = torch.nn.Parameter(hparam, requires_grad=True)\n",
171
+ " hparam_grad_mask = torch.zeros_like(hparam)\n",
172
+ " hparam_grad_mask[vary_hparam_ind] = 1\n",
173
+ "\n",
174
+ " optimizer = torch.optim.Adam([hparam], lr=0.1)\n",
175
+ " \n",
176
+ " for t in range(N_grad_steps):\n",
177
+ " style = hparam.repeat(data.shape[1], 1)\n",
178
+ " outputs = torch.sigmoid(model[2]((style, data, targets.float()), single_eval_pos=eval_pos)).squeeze(-1)\n",
179
+ " loss = torch.nn.BCELoss()(outputs.flatten(), targets[eval_pos:].flatten())\n",
180
+ " optimizer.zero_grad()\n",
181
+ " loss.backward()\n",
182
+ " with torch.no_grad():\n",
183
+ " hparam.grad *= hparam_grad_mask\n",
184
+ " optimizer.step()\n",
185
+ " print('loss:', loss, 'hparams', diff_hparams_f[vary_hparam_ind][1](hparam[vary_hparam_ind]), 'true', diff_hparams_f[vary_hparam_ind][1](hparam_true[vary_hparam_ind]))\n",
186
+ " inferred_param = diff_hparams_f[vary_hparam_ind][1](hparam[vary_hparam_ind].cpu().detach().numpy())\n",
187
+ " return hparams, losses, inferred_param, vary_hparam_ind, hparam_true\n",
188
+ " "
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "markdown",
193
+ "metadata": {
194
+ "tags": []
195
+ },
196
+ "source": [
197
+ "#### Fitting a PFN with HP-Diffable GP Prior"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 10,
203
+ "metadata": {
204
+ "hidden": true,
205
+ "tags": []
206
+ },
207
+ "outputs": [],
208
+ "source": [
209
+ "num_features = 5\n",
210
+ "bptt = 200\n",
211
+ "eval_positions = [100]\n",
212
+ "\n",
213
+ "config_general = get_general_config(num_features, bptt, eval_positions)\n",
214
+ "config_flexible_categorical = get_flexible_categorical_config(num_features)\n",
215
+ "\n",
216
+ "config_gp = {'noise': 0.2, \"lengthscale\": 1.0, \"outputscale\": 1.0}\n",
217
+ "config_diff_gp = {'differentiable_hyperparameters': {\n",
218
+ " 'outputscale': {'distribution': 'uniform', 'min': 0., 'max': 10.0},\n",
219
+ " 'lengthscale': {'distribution': 'uniform', 'min': 0., 'max': 10.0},\n",
220
+ " 'noise': {'distribution': 'uniform', 'min': 0.0000001, 'max': 0.5},\n",
221
+ " }\n",
222
+ "}\n",
223
+ "\n",
224
+ "config = {**config_general, **config_flexible_categorical, **config_diff_gp, **config_gp}\n",
225
+ "\n",
226
+ "config['prior_type'], config['differentiable'], config['flexible'] = 'gp', True, True\n",
227
+ "config['num_features'], config['num_features_used'] = num_features, num_features\n",
228
+ "config['epochs'], config['num_steps'], config['verbose'] = 500, 100, False\n",
229
+ "config[\"lr\"] = 0.00001\n",
230
+ "config[\"dropout\"] = 0\n",
231
+ "config[\"emsize\"] = 512\n",
232
+ "config[\"batch_size\"] = 128\n",
233
+ "config[\"aggregate_k_gradients\"] = 1\n",
234
+ "config['set_value_to_nan'] = 0.0\n",
235
+ "config['output_multiclass_ordered_p'] = 1.0\n",
236
+ "config['categorical_feature_p'] = 0.0\n",
237
+ "config['nan_prob_a_reason'] = 0.0\n",
238
+ "config['nan_prob_no_reason'] = 0.0\n",
239
+ "config['nan_prob_unknown_reason'] = 0.0\n",
240
+ "config[\"nlayers\"] = 8\n",
241
+ "\n",
242
+ "# TODO: This should not be sampled, but be one config\n",
243
+ "# TODO: This uses old hyperparam sampler throws error\n",
244
+ "config_sample = evaluate_hypers(config)"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": 11,
250
+ "metadata": {
251
+ "hidden": true,
252
+ "tags": []
253
+ },
254
+ "outputs": [
255
+ {
256
+ "name": "stdout",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "Using style prior: True\n",
260
+ "Using cpu:0 device\n",
261
+ "Not using distributed\n",
262
+ "DataLoader.__dict__ {'num_steps': 100, 'fuse_x_y': False, 'get_batch_kwargs': {'batch_size': 128, 'seq_len': 200, 'seq_len_maximum': 200, 'device': 'cpu:0', 'num_features': 5, 'hyperparameters': {'lr': 1e-05, 'dropout': 0, 'emsize': 512, 'batch_size': 128, 'nlayers': 8, 'num_features': 5, 'nhead': 4, 'nhid_factor': 2, 'bptt': 200, 'eval_positions': None, 'seq_len_used': 200, 'sampling': 'normal', 'epochs': 500, 'num_steps': 100, 'verbose': False, 'pre_sample_causes': True, 'mix_activations': False, 'nan_prob_unknown_reason_reason_prior': 1.0, 'categorical_feature_p': 0.0, 'nan_prob_no_reason': 0.0, 'nan_prob_unknown_reason': 0.0, 'nan_prob_a_reason': 0.0, 'max_num_classes': 2, 'num_classes': 2, 'noise_type': 'Gaussian', 'balanced': True, 'normalize_to_ranking': False, 'set_value_to_nan': 0.0, 'normalize_by_used_features': True, 'num_features_used': 5, 'differentiable_hyperparameters': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': 0.2, 'lengthscale': 1.0, 'outputscale': 1.0, 'prior_type': 'gp', 'differentiable': True, 'flexible': True, 'aggregate_k_gradients': 1, 'output_multiclass_ordered_p': 1.0, 'recompute_attn': False}, 'num_outputs': 1, 'dynamic_batch_size': 2, 'get_batch': <function get_model.<locals>.make_get_batch.<locals>.<lambda> at 0x7f39ad8dcf80>, 'differentiable_hyperparameters': {'outputscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'lengthscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': {'distribution': 'uniform', 'min': 1e-07, 'max': 0.5}}}, 'num_features': 5, 'num_outputs': 1}\n",
263
+ "Using a Transformer with 17.35 M parameters\n"
264
+ ]
265
+ }
266
+ ],
267
+ "source": [
268
+ "device = 'cuda'\n",
269
+ "train_function(config_sample, 0, add_name='gp_experiments_diff_with_noise_no_meta_new')"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "markdown",
274
+ "metadata": {
275
+ "tags": []
276
+ },
277
+ "source": [
278
+ "#### Evaluating a PFN (with pretrained model)"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 13,
284
+ "metadata": {
285
+ "hidden": true,
286
+ "tags": []
287
+ },
288
+ "outputs": [
289
+ {
290
+ "name": "stdout",
291
+ "output_type": "stream",
292
+ "text": [
293
+ "Using style prior: True\n",
294
+ "Using cpu:0 device\n",
295
+ "Not using distributed\n",
296
+ "DataLoader.__dict__ {'num_steps': 100, 'fuse_x_y': False, 'get_batch_kwargs': {'batch_size': 1, 'seq_len': 10, 'seq_len_maximum': 10, 'device': 'cpu:0', 'num_features': 5, 'hyperparameters': {'lr': 1e-05, 'dropout': 0, 'emsize': 512, 'batch_size': 1, 'nlayers': 8, 'num_features': 5, 'nhead': 4, 'nhid_factor': 2, 'bptt': 10, 'eval_positions': [190], 'seq_len_used': 200, 'sampling': 'normal', 'epochs': 500, 'num_steps': 100, 'verbose': False, 'pre_sample_causes': True, 'mix_activations': False, 'nan_prob_unknown_reason_reason_prior': 1.0, 'output_multiclass_ordered_p': 1.0, 'categorical_feature_p': 0.0, 'nan_prob_no_reason': 0.0, 'nan_prob_unknown_reason': 0.0, 'nan_prob_a_reason': 0.0, 'max_num_classes': 2, 'num_classes': 2, 'noise_type': 'Gaussian', 'balanced': True, 'multiclass_type': 'rank', 'normalize_to_ranking': False, 'set_value_to_nan': 0.0, 'normalize_by_used_features': True, 'num_features_used': <function load_model.<locals>.<lambda> at 0x7f39ad8534d0>, 'differentiable_hyperparameters': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': 0.03, 'lengthscale': 1.0, 'outputscale': 1.0, 'prior_type': 'gp', 'differentiable': True, 'flexible': True, 'aggregate_k_gradients': 1, 'recompute_attn': False, 'bptt_extra_samples': None, 'epoch_in_training': 0.998, 'categorical_features_sampler': <function load_model.<locals>.<lambda> at 0x7f39ad853680>, 'num_features_used_in_training': 5, 'num_classes_in_training': 2, 'batch_size_in_training': 128, 'bptt_in_training': 200, 'bptt_extra_samples_in_training': None}, 'num_outputs': 1, 'dynamic_batch_size': 2, 'get_batch': <function get_model.<locals>.make_get_batch.<locals>.<lambda> at 0x7f39ad81ab90>, 'differentiable_hyperparameters': {'outputscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'lengthscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': {'distribution': 'uniform', 'min': 1e-07, 'max': 0.5}}}, 'num_features': 5, 'num_outputs': 1}\n",
297
+ "Using a Transformer with 17.35 M parameters\n"
298
+ ]
299
+ }
300
+ ],
301
+ "source": [
302
+ "device = 'cpu'\n",
303
+ "model, c = load_model(base_path, f'models_diff/gp_ablation_model.cpkt', device, eval_positions, verbose=False)"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 14,
309
+ "metadata": {},
310
+ "outputs": [],
311
+ "source": [
312
+ "from priors.differentiable_prior import DifferentiableHyperparameterList\n",
313
+ "diff_list = DifferentiableHyperparameterList(c['differentiable_hyperparameters'], 512, device)\n",
314
+ "diff_hparams_keys, diff_hparams_f = diff_list.get_hyperparameter_info()"
315
+ ]
316
+ },
317
+ {
318
+ "cell_type": "code",
319
+ "execution_count": null,
320
+ "metadata": {
321
+ "tags": []
322
+ },
323
+ "outputs": [],
324
+ "source": [
325
+ "model[2].eval()\n",
326
+ "eval_pos = 100\n",
327
+ "\n",
328
+ "hparam_label = [(1, 'outputscale')]\n",
329
+ "hparam_label = [(0, 'lengthscale')]\n",
330
+ "hparam_label = [(2, 'noise')]\n",
331
+ "hparam_labels = [[(1, 'outputscale')], [(2, 'noise')], [(0, 'lengthscale')]]\n",
332
+ "#hparam_labels = [[(2, 'noise')]]\n",
333
+ "\n",
334
+ "hparams, losses, inferred_param, vary_hparam_ind, hparam_true = {}, {}, {}, {}, {}\n",
335
+ "\n",
336
+ "for hparam_label in hparam_labels:\n",
337
+ " (hparams[hparam_label[0][1]], losses[hparam_label[0][1]], inferred_param[hparam_label[0][1]], vary_hparam_ind[hparam_label[0][1]], \n",
338
+ " hparam_true[hparam_label[0][1]]) = differentiable_hparam_tuning_workflow(config_sample, \n",
339
+ " hparam_label=hparam_label, \n",
340
+ " batch_size=256, \n",
341
+ " N_grad_steps=50,\n",
342
+ " plot_step_size=0.05)\n"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": null,
348
+ "metadata": {},
349
+ "outputs": [],
350
+ "source": [
351
+ "label = 'lengthscale'\n",
352
+ "\n",
353
+ "#import tikzplotlib\n",
354
+ "\n",
355
+ "inferred = losses[label]\n",
356
+ "\n",
357
+ "plt.plot(hparams[label][:, vary_hparam_ind[label]], losses[label])\n",
358
+ "true = diff_hparams_f[vary_hparam_ind[label]][1](hparam_true[label][vary_hparam_ind[label]])\n",
359
+ "plt.axvline(x=inferred_param[label], linestyle='solid', color='red')\n",
360
+ "plt.axvline(x=true, linestyle='dashed')\n",
361
+ "\n",
362
+ "plt.ylabel('Cross entropy Loss')\n",
363
+ "plt.xlabel(label)\n",
364
+ "\n",
365
+ "#tikzplotlib.save(f'diff_inferred_params_{label}.tex', axis_height='5.2cm', axis_width='5.2cm', strict=True)\n",
366
+ "\n",
367
+ "plt.show()"
368
+ ]
369
+ }
370
+ ],
371
+ "metadata": {
372
+ "kernelspec": {
373
+ "display_name": "Python 3 (ipykernel)",
374
+ "language": "python",
375
+ "name": "python3"
376
+ },
377
+ "language_info": {
378
+ "codemirror_mode": {
379
+ "name": "ipython",
380
+ "version": 3
381
+ },
382
+ "file_extension": ".py",
383
+ "mimetype": "text/x-python",
384
+ "name": "python",
385
+ "nbconvert_exporter": "python",
386
+ "pygments_lexer": "ipython3",
387
+ "version": "3.7.13"
388
+ }
389
+ },
390
+ "nbformat": 4,
391
+ "nbformat_minor": 4
392
+ }
TabPFN/TabPFNPredictionOnly.ipynb ADDED
@@ -0,0 +1,253 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "This notebook shows how to use TabPFN for tabular prediction with a scikit learn wrapper.\n",
8
+ "\n",
9
+ "classifier = TabPFNClassifier(device='cpu')\n",
10
+ "classifier.fit(train_xs, train_ys)\n",
11
+ "prediction_ = classifier.predict(test_xs)\n",
12
+ "\n",
13
+ "The fit function does not perform any computations, but only saves the training data. Computations are only done at inference time, when calling predict.\n",
14
+ "Note that the presaved models were trained for up to 100 features, 10 classes and 1000 samples. While the model does not have a hard bound on the number of samples, the features and classes are restricted and larger sizes lead to an error."
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "markdown",
19
+ "metadata": {
20
+ "tags": []
21
+ },
22
+ "source": [
23
+ "### Setup"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": null,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "%load_ext autoreload\n",
33
+ "\n",
34
+ "%autoreload 2"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "import time\n",
44
+ "import torch\n",
45
+ "import numpy as np\n",
46
+ "import os\n",
47
+ "import random\n",
48
+ "\n",
49
+ "from model_builder import get_model, get_default_spec, save_model, load_model\n",
50
+ "from scripts.transformer_prediction_interface import transformer_predict, get_params_from_config, TabPFNClassifier\n",
51
+ "\n",
52
+ "from datasets import load_openml_list, open_cc_dids, open_cc_valid_dids\n",
53
+ "\n",
54
+ "from scripts import tabular_metrics"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "base_path = '.'"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "markdown",
68
+ "metadata": {
69
+ "tags": []
70
+ },
71
+ "source": [
72
+ "### Load datasets"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "metadata": {
79
+ "jupyter": {
80
+ "outputs_hidden": true
81
+ },
82
+ "tags": []
83
+ },
84
+ "outputs": [],
85
+ "source": [
86
+ "max_samples = 10000\n",
87
+ "bptt = 10000\n",
88
+ "\n",
89
+ "cc_test_datasets_multiclass, cc_test_datasets_multiclass_df = load_openml_list(open_cc_dids, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = max_samples, num_feats=100, return_capped=True)\n",
90
+ "cc_valid_datasets_multiclass, cc_valid_datasets_multiclass_df = load_openml_list(open_cc_valid_dids, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = max_samples, num_feats=100, return_capped=True)\n",
91
+ "\n",
92
+ "# Loading longer OpenML Datasets for generalization experiments (optional)\n",
93
+ "# test_datasets_multiclass, test_datasets_multiclass_df = load_openml_list(test_dids_classification, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 10000, num_feats=100, return_capped=True)\n",
94
+ "\n",
95
+ "random.seed(0)\n",
96
+ "random.shuffle(cc_valid_datasets_multiclass)"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": null,
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "from datasets import get_openml_classification"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": null,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "dataset = openml.datasets.get_dataset(31)\n",
115
+ "X, y, categorical_indicator, attribute_names = dataset.get_data(\n",
116
+ " dataset_format=\"array\", target=dataset.default_target_attribute\n",
117
+ " )"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": null,
123
+ "metadata": {},
124
+ "outputs": [],
125
+ "source": [
126
+ "def get_datasets(selector, task_type, suite='cc'):\n",
127
+ " if task_type == 'binary':\n",
128
+ " ds = valid_datasets_binary if selector == 'valid' else test_datasets_binary\n",
129
+ " else:\n",
130
+ " if suite == 'openml':\n",
131
+ " ds = valid_datasets_multiclass if selector == 'valid' else test_datasets_multiclass\n",
132
+ " elif suite == 'cc':\n",
133
+ " ds = cc_valid_datasets_multiclass if selector == 'valid' else cc_test_datasets_multiclass\n",
134
+ " else:\n",
135
+ " raise Exception(\"Unknown suite\")\n",
136
+ " return ds"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "model_string, longer, task_type = '', 1, 'multiclass'\n",
146
+ "eval_positions = [1000]\n",
147
+ "bptt = 2000\n",
148
+ " \n",
149
+ "test_datasets, valid_datasets = get_datasets('test', task_type, suite='cc'), get_datasets('valid', task_type, suite='cc')"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "markdown",
154
+ "metadata": {
155
+ "jp-MarkdownHeadingCollapsed": true,
156
+ "tags": []
157
+ },
158
+ "source": [
159
+ "### Select a dataset for prediction"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "[(i, test_datasets[i][0]) for i in range(len(test_datasets))]"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "evaluation_dataset_index = 4 # Index of the dataset to predict\n",
178
+ "ds = test_datasets[evaluation_dataset_index]\n",
179
+ "print(f'Evaluation dataset name: {ds[0]} shape {ds[1].shape}')"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "xs, ys = ds[1].clone(), ds[2].clone()\n",
189
+ "eval_position = xs.shape[0] // 2\n",
190
+ "train_xs, train_ys = xs[0:eval_position], ys[0:eval_position]\n",
191
+ "test_xs, test_ys = xs[eval_position:], ys[eval_position:]"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "markdown",
196
+ "metadata": {
197
+ "tags": []
198
+ },
199
+ "source": [
200
+ "### Predict using a Fitted and Tuned Model"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": null,
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": [
209
+ "classifier = TabPFNClassifier(device='cpu')\n",
210
+ "classifier.fit(train_xs, train_ys)\n",
211
+ "prediction_ = classifier.predict_proba(test_xs)"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "roc, ce = tabular_metrics.auc_metric(test_ys, prediction_), tabular_metrics.cross_entropy(test_ys, prediction_)\n",
221
+ "'AUC', float(roc), 'Cross Entropy', float(ce)"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": null,
227
+ "metadata": {},
228
+ "outputs": [],
229
+ "source": []
230
+ }
231
+ ],
232
+ "metadata": {
233
+ "kernelspec": {
234
+ "display_name": "Python 3 (ipykernel)",
235
+ "language": "python",
236
+ "name": "python3"
237
+ },
238
+ "language_info": {
239
+ "codemirror_mode": {
240
+ "name": "ipython",
241
+ "version": 3
242
+ },
243
+ "file_extension": ".py",
244
+ "mimetype": "text/x-python",
245
+ "name": "python",
246
+ "nbconvert_exporter": "python",
247
+ "pygments_lexer": "ipython3",
248
+ "version": "3.7.13"
249
+ }
250
+ },
251
+ "nbformat": 4,
252
+ "nbformat_minor": 4
253
+ }
TabPFN/TabularEvaluationVisualization.ipynb ADDED
The diff for this file is too large to render. See raw diff
TabPFN/TrainingTuningAndPrediction.ipynb ADDED
The diff for this file is too large to render. See raw diff
TabPFN/datasets/__init__.py ADDED
@@ -0,0 +1,149 @@
1
+ import pandas as pd
2
+ import torch
3
+ import numpy as np
4
+ import openml
5
+
6
+
7
+ def get_openml_classification(did, max_samples, multiclass=True, shuffled=True):
8
+ dataset = openml.datasets.get_dataset(did)
9
+ X, y, categorical_indicator, attribute_names = dataset.get_data(
10
+ dataset_format="array", target=dataset.default_target_attribute
11
+ )
12
+
13
+ if not multiclass:
14
+ X = X[y < 2]
15
+ y = y[y < 2]
16
+
17
+ if multiclass and not shuffled:
18
+ raise NotImplementedError("This combination of multiclass and shuffling isn't implemented")
19
+
20
+ if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
21
+ print('Not a NP Array, skipping')
22
+ return None, None, None, None
23
+
24
+ if not shuffled:
25
+ sort = np.argsort(y) if y.mean() < 0.5 else np.argsort(-y)
26
+ pos = int(y.sum()) if y.mean() < 0.5 else int((1 - y).sum())
27
+ X, y = X[sort][-pos * 2:], y[sort][-pos * 2:]
28
+ y = torch.tensor(y).reshape(2, -1).transpose(0, 1).reshape(-1).flip([0]).float()
29
+ X = torch.tensor(X).reshape(2, -1, X.shape[1]).transpose(0, 1).reshape(-1, X.shape[1]).flip([0]).float()
30
+ else:
31
+ order = np.arange(y.shape[0])
32
+ np.random.seed(13)
33
+ np.random.shuffle(order)
34
+ X, y = torch.tensor(X[order]), torch.tensor(y[order])
35
+ if max_samples:
36
+ X, y = X[:max_samples], y[:max_samples]
37
+
38
+ return X, y, list(np.where(categorical_indicator)[0]), attribute_names
39
+
40
+ def load_openml_list(dids, filter_for_nan=False
41
+ , num_feats=100
42
+ , min_samples = 100
43
+ , max_samples=400
44
+ , multiclass=True
45
+ , max_num_classes=10
46
+ , shuffled=True
47
+ , return_capped = False):
48
+ datasets = []
49
+ openml_list = openml.datasets.list_datasets(dids)
50
+ print(f'Number of datasets: {len(openml_list)}')
51
+
52
+ datalist = pd.DataFrame.from_dict(openml_list, orient="index")
53
+ if filter_for_nan:
54
+ datalist = datalist[datalist['NumberOfInstancesWithMissingValues'] == 0]
55
+ print(f'Number of datasets after Nan and feature number filtering: {len(datalist)}')
56
+
57
+ for ds in datalist.index:
58
+ modifications = {'samples_capped': False, 'classes_capped': False, 'feats_capped': False}
59
+ entry = datalist.loc[ds]
60
+
61
+ print('Loading', entry['name'], entry.did, '..')
62
+
63
+ if entry['NumberOfClasses'] == 0.0:
64
+ raise Exception("Regression not supported")
65
+ #X, y, categorical_feats, attribute_names = get_openml_regression(int(entry.did), max_samples)
66
+ else:
67
+ X, y, categorical_feats, attribute_names = get_openml_classification(int(entry.did), max_samples
68
+ , multiclass=multiclass, shuffled=shuffled)
69
+ if X is None:
70
+ continue
71
+
72
+ if X.shape[1] > num_feats:
73
+ if return_capped:
74
+ X = X[:, 0:num_feats]
75
+ categorical_feats = [c for c in categorical_feats if c < num_feats]
76
+ modifications['feats_capped'] = True
77
+ else:
78
+ print('Too many features')
79
+ continue
80
+ if X.shape[0] == max_samples:
81
+ modifications['samples_capped'] = True
82
+
83
+ if X.shape[0] < min_samples:
84
+ print(f'Too few samples left')
85
+ continue
86
+
87
+ if len(np.unique(y)) > max_num_classes:
88
+ if return_capped:
89
+ X = X[y < np.unique(y)[10]]
90
+ y = y[y < np.unique(y)[10]]
91
+ modifications['classes_capped'] = True
92
+ else:
93
+ print(f'Too many classes')
94
+ continue
95
+
96
+ datasets += [[entry['name'], X, y, categorical_feats, attribute_names, modifications]]
97
+
98
+ return datasets, datalist
99
+
100
+
101
+ # Classification
102
+ valid_dids_classification = [13, 59, 4, 15, 40710, 43, 1498]
103
+ test_dids_classification = [973, 1596, 40981, 1468, 40984, 40975, 41163, 41147, 1111, 41164, 1169, 1486, 41143, 1461, 41167, 40668, 41146, 41169, 41027, 23517, 41165, 41161, 41159, 41138, 1590, 41166, 1464, 41168, 41150, 1489, 41142, 3, 12, 31, 54, 1067]
104
+ valid_large_classification = [ 943, 23512, 49, 838, 1131, 767, 1142, 748, 1112,
105
+ 1541, 384, 912, 1503, 796, 20, 30, 903, 4541,
106
+ 961, 805, 1000, 4135, 1442, 816, 1130, 906, 1511,
107
+ 184, 181, 137, 1452, 1481, 949, 449, 50, 913,
108
+ 1071, 831, 843, 9, 896, 1532, 311, 39, 451,
109
+ 463, 382, 778, 474, 737, 1162, 1538, 820, 188,
110
+ 452, 1156, 37, 957, 911, 1508, 1054, 745, 1220,
111
+ 763, 900, 25, 387, 38, 757, 1507, 396, 4153,
112
+ 806, 779, 746, 1037, 871, 717, 1480, 1010, 1016,
113
+ 981, 1547, 1002, 1126, 1459, 846, 837, 1042, 273,
114
+ 1524, 375, 1018, 1531, 1458, 6332, 1546, 1129, 679,
115
+ 389]
116
+
117
+ open_cc_dids = [11,
118
+ 14,
119
+ 15,
120
+ 16,
121
+ 18,
122
+ 22,
123
+ 23,
124
+ 29,
125
+ 31,
126
+ 37,
127
+ 50,
128
+ 54,
129
+ 188,
130
+ 458,
131
+ 469,
132
+ 1049,
133
+ 1050,
134
+ 1063,
135
+ 1068,
136
+ 1510,
137
+ 1494,
138
+ 1480,
139
+ 1462,
140
+ 1464,
141
+ 6332,
142
+ 23381,
143
+ 40966,
144
+ 40982,
145
+ 40994,
146
+ 40975]
147
+ # Filtered by N_samples < 2000, N feats < 100, N classes < 10
148
+
149
+ open_cc_valid_dids = [13,25,35,40,41,43,48,49,51,53,55,56,59,61,187,285,329,333,334,335,336,337,338,377,446,450,451,452,460,463,464,466,470,475,481,679,694,717,721,724,733,738,745,747,748,750,753,756,757,764,765,767,774,778,786,788,795,796,798,801,802,810,811,814,820,825,826,827,831,839,840,841,844,852,853,854,860,880,886,895,900,906,907,908,909,915,925,930,931,934,939,940,941,949,966,968,984,987,996,1048,1054,1071,1073,1100,1115,1412,1442,1443,1444,1446,1447,1448,1451,1453,1488,1490,1495,1498,1499,1506,1508,1511,1512,1520,1523,4153,23499,40496,40646,40663,40669,40680,40682,40686,40690,40693,40705,40706,40710,40711,40981,41430,41538,41919,41976,42172,42261,42544,42585,42638]
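As an aside, the notebooks in this commit call the loader defined above roughly as follows; the sketch only restates that call and the structure of the returned entries (the keyword values are the ones used in PrepareDatasets.ipynb, not defaults).

```python
# Illustrative sketch: load the curated OpenML suite defined in TabPFN/datasets/__init__.py.
# Note that this downloads the datasets from OpenML on first use.
from datasets import load_openml_list, open_cc_dids

datasets, datasets_df = load_openml_list(
    open_cc_dids,
    multiclass=True,
    shuffled=True,
    filter_for_nan=False,
    max_samples=2000,    # cap each dataset at 2000 rows
    num_feats=100,       # feature limit; with return_capped=True extra features are dropped
    return_capped=True,
)

# Each entry is [name, X, y, categorical_feats, attribute_names, modifications].
name, X, y, categorical_feats, attribute_names, modifications = datasets[0]
```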
TabPFN/datasets/utils.py ADDED
@@ -0,0 +1,8 @@
+ def normalize_data(eval_xs):
+     mean = eval_xs.mean(0)
+     std = eval_xs.std(0) + .000001
+     eval_xs = (eval_xs - mean) / std
+
+     return eval_xs
+
+
TabPFN/decoders.py ADDED
@@ -0,0 +1,30 @@
+ import torch
+ from torch import nn
+ import random
+
+
+ class ScaledDecoder(nn.Module):
+     def __init__(self, ninp, nhid, nout):
+         super().__init__()
+         self.linear = nn.Linear(ninp, nhid)
+         self.linear1 = nn.Linear(nhid, nout)
+         self.linear2 = nn.Linear(nhid, 10)
+
+     def forward(self, x):
+         #return torch.cat([self.linear1(x), self.linear2(x)], -1)
+         x = self.linear(x)
+         x = nn.GELU()(x)
+         temps = self.linear2(x).softmax(-1) @ torch.tensor([1.,1.4,1.7,2.,5.,10.,20.,40.,80.,160.], device=x.device)
+         if random.random() > .99:
+             print(temps.shape,temps[:,:2])
+         return self.linear1(x) / temps.unsqueeze(-1)
+
+ class FixedScaledDecoder(nn.Module):
+     def __init__(self, ninp, nhid, nout):
+         super().__init__()
+         self.mapper = nn.Sequential(nn.Linear(ninp, nhid), nn.GELU(), nn.Linear(nhid, nout))
+         self.T = nn.Parameter(torch.ones(10000)/10000)
+
+     def forward(self, x):
+         return self.mapper(x)/self.T.sum()
+
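A side note on ScaledDecoder above: linear2 predicts softmax weights over ten fixed anchor temperatures, and the logits from linear1 are divided by the resulting per-position temperature, letting the network soften or sharpen its own predictions. The standalone sketch below (random tensors, hypothetical sizes) only re-traces that temperature computation and is not part of the commit.

```python
# Standalone sketch of the learned temperature scaling used in ScaledDecoder.
import torch

nhid, seq, batch = 512, 200, 8
x = torch.randn(seq, batch, nhid)         # hidden states after the GELU
linear2 = torch.nn.Linear(nhid, 10)
anchors = torch.tensor([1., 1.4, 1.7, 2., 5., 10., 20., 40., 80., 160.])

# Softmax weights over the ten anchor temperatures -> one temperature per position.
temps = linear2(x).softmax(-1) @ anchors                      # shape: (seq, batch)
logits = torch.randn(seq, batch, 10) / temps.unsqueeze(-1)    # divide logits by temperature
print(temps.shape, logits.shape)
```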
TabPFN/differentiable_pfn_evaluation.py ADDED
@@ -0,0 +1,345 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ import time
5
+ import pickle
6
+ from scripts import tabular_metrics
7
+ from scripts.tabular_metrics import calculate_score_per_method
8
+ from scripts.tabular_evaluation import evaluate
9
+ from priors.differentiable_prior import draw_random_style
10
+ from tqdm import tqdm
11
+ import random
12
+ from scripts.transformer_prediction_interface import get_params_from_config, load_model_workflow
13
+
14
+ """
15
+ ===============================
16
+ PUBLIC FUNCTIONS FOR EVALUATION
17
+ ===============================
18
+ """
19
+
20
+
21
+ def eval_model_range(i_range, *args, **kwargs):
22
+ for i in i_range:
23
+ eval_model(i, *args, **kwargs)
24
+
25
+
26
+
27
+ def eval_model(i, e, valid_datasets, test_datasets, train_datasets, eval_positions_valid, eval_positions_test,
28
+ bptt_valid,
29
+ bptt_test, add_name, base_path, device='cpu', eval_addition='', **extra_tuning_args):
30
+ """
31
+ Differentiable model evaluation workflow. Evaluates and saves results to disk.
32
+
33
+ :param i:
34
+ :param e:
35
+ :param valid_datasets:
36
+ :param test_datasets:
37
+ :param train_datasets:
38
+ :param eval_positions_valid:
39
+ :param eval_positions_test:
40
+ :param bptt_valid:
41
+ :param bptt_test:
42
+ :param add_name:
43
+ :param base_path:
44
+ :param device:
45
+ :param eval_addition:
46
+ :param extra_tuning_args:
47
+ :return:
48
+ """
49
+ model, c, results_file = load_model_workflow(i, e, add_name, base_path, device, eval_addition)
50
+ params = {'bptt': bptt_valid
51
+ , 'bptt_final': bptt_test
52
+ , 'eval_positions': eval_positions_valid
53
+ , 'eval_positions_test': eval_positions_test
54
+ , 'valid_datasets': valid_datasets
55
+ , 'test_datasets': test_datasets
56
+ , 'train_datasets': train_datasets
57
+ , 'verbose': True
58
+ , 'device': device
59
+ }
60
+
61
+ params.update(get_params_from_config(c))
62
+
63
+ start = time.time()
64
+ metrics, metrics_valid, style, temperature, optimization_route = evaluate_differentiable_model(model, **params,
65
+ **extra_tuning_args)
66
+ print('Evaluation time: ', time.time() - start)
67
+
68
+ print(results_file)
69
+ r = [c.copy(), metrics, metrics_valid, style.to('cpu'), temperature.to('cpu'), optimization_route]
70
+ with open(results_file, 'wb') as output:
71
+ del r[0]['num_features_used']
72
+ del r[0]['categorical_features_sampler']
73
+ pickle.dump(r, output)
74
+
75
+ _, _, _, style, temperature, _ = r
76
+
77
+ return r, model
78
+
79
+ """
80
+ ===============================
81
+ INTERNAL HELPER FUNCTIONS
82
+ ===============================
83
+ """
84
+
85
+ def evaluate_differentiable_model(model
86
+ , valid_datasets
87
+ , test_datasets
88
+ , train_datasets
89
+ , N_draws=100
90
+ , N_grad_steps=10
91
+ , eval_positions=None
92
+ , eval_positions_test=None
93
+ , bptt=100
94
+ , bptt_final=200
95
+ , style=None
96
+ , n_parallel_configurations=1
97
+ , device='cpu'
98
+ , selection_metric='auc'
99
+ , final_splits=[1, 2, 3, 4, 5]
100
+ , N_ensemble_configurations_list=[1, 5, 10, 20, 50, 100]
101
+ , **kwargs):
102
+ """
103
+ Evaluation function for diffable model evaluation. Returns a list of results.
104
+
105
+ :param model:
106
+ :param valid_datasets:
107
+ :param test_datasets:
108
+ :param train_datasets:
109
+ :param N_draws:
110
+ :param N_grad_steps:
111
+ :param eval_positions:
112
+ :param eval_positions_test:
113
+ :param bptt:
114
+ :param bptt_final:
115
+ :param style:
116
+ :param n_parallel_configurations:
117
+ :param device:
118
+ :param selection_metric:
119
+ :param final_splits:
120
+ :param N_ensemble_configurations_list:
121
+ :param kwargs:
122
+ :return:
123
+ """
124
+ torch.manual_seed(0)
125
+ np.random.seed(0)
126
+ random.seed(0)
127
+
128
+ diffable_metric = tabular_metrics.cross_entropy
129
+ evaluation_metric = tabular_metrics.auc_metric
130
+ if selection_metric in ('auc', 'roc'):
131
+ selection_metric_min_max = 'max'
132
+ selection_metric = tabular_metrics.auc_metric
133
+ evaluation_metric = selection_metric
134
+ elif selection_metric in ('ce', 'selection_metric'):
135
+ selection_metric_min_max = 'min'
136
+ selection_metric = tabular_metrics.cross_entropy
137
+ evaluation_metric = selection_metric
138
+
139
+ print('Diffable metric', diffable_metric, ' Selection metric', selection_metric, ' Evaluation metric',
140
+ evaluation_metric)
141
+ print('N PARALLEL CONFIGURATIONS', n_parallel_configurations)
142
+ print('eval_positions', eval_positions)
143
+
144
+ def evaluate_valid(style, softmax_temperature, results, results_tracked):
145
+ result_valid = eval_step(valid_datasets, style, softmax_temperature=softmax_temperature,
146
+ return_tensor=False, inference_mode=True, selection_metric=selection_metric,
147
+ evaluation_metric=evaluation_metric, eval_positions=eval_positions, bptt=bptt, model=model[2])
148
+ result_valid = [float(result_valid[f'mean_select_at_{pos}']) for pos in eval_positions]
149
+ results += [result_valid]
150
+ results_tracked += [np.nanmean(result_valid)]
151
+
152
+ model[2].to(device)
153
+ model[2].eval()
154
+
155
+ results_on_valid, results_on_valid_tracked = [], []
156
+ best_style, best_softmax_temperature = style, torch.cat(
157
+ [torch.tensor([0.0]).to(device) for n in range(0, n_parallel_configurations)], 0)
158
+ optimization_routes = []
159
+
160
+ best_style = torch.cat([draw_random_style(model[3], device).detach() for n in range(0, n_parallel_configurations)],
161
+ 0)
162
+ best_softmax_temperature = torch.cat([torch.tensor([0.0]).to(device) for n in range(0, n_parallel_configurations)],
163
+ 0)
164
+
165
+
166
+ for _ in tqdm(range(0, N_draws), desc='Iterate over Optimization initializations'): # Evaluates N hparam draws
167
+ style = torch.cat([draw_random_style(model[3], device).detach() for n in range(0, n_parallel_configurations)],
168
+ 0)
169
+ softmax_temperature = torch.cat([torch.tensor([0.0]).to(device) for n in range(0, n_parallel_configurations)],
170
+ 0)
171
+
172
+ evaluate_valid(style, softmax_temperature, results_on_valid, results_on_valid_tracked)
173
+
174
+ print(f'Draw --> Valid Selection metric: {results_on_valid[-1]}')
175
+
176
+ if N_grad_steps > 0:
177
+ gradient_optimize_result = gradient_optimize_style(model, style, N_grad_steps
178
+ , softmax_temperature=softmax_temperature
179
+ , model=model[2]
180
+ , train_datasets=train_datasets
181
+ , valid_datasets=valid_datasets
182
+ , selection_metric_min_max=selection_metric_min_max
183
+ , **kwargs)
184
+ optimization_routes += [gradient_optimize_result['optimization_route']]
185
+
186
+ evaluate_valid(gradient_optimize_result['best_style']
187
+ , gradient_optimize_result['best_temperature']
188
+ , results_on_valid, results_on_valid_tracked)
189
+
190
+ print(f'After diff --> Valid Selection metric: {results_on_valid[-1]}')
191
+
192
+ if selection_metric_min_max == 'min':
193
+ is_best = (results_on_valid_tracked[-1] <= min(results_on_valid_tracked))
194
+ else:
195
+ is_best = (results_on_valid_tracked[-1] >= max(results_on_valid_tracked))
196
+
197
+ if is_best or best_style is None:
198
+ best_style = gradient_optimize_result['best_style'].clone()
199
+ best_softmax_temperature = gradient_optimize_result['best_temperature'].clone()
200
+ torch.cuda.empty_cache()
201
+
202
+ def final_evaluation():
203
+ print('Running eval dataset with final params (no gradients)..')
204
+ print(best_style, best_softmax_temperature)
205
+ result_test = []
206
+ for N_ensemble_configurations in N_ensemble_configurations_list:
207
+ print(f'Running with {N_ensemble_configurations} ensemble_configurations')
208
+ kwargs['N_ensemble_configurations'] = N_ensemble_configurations
209
+ splits = []
210
+ for split in final_splits:
211
+ splits += [eval_step(test_datasets, best_style, softmax_temperature=best_softmax_temperature
212
+ , return_tensor=False, eval_positions=eval_positions_test,
213
+ bptt=bptt_final, inference_mode=True, split_number=split, model=model[2]
214
+ , selection_metric=selection_metric, evaluation_metric=evaluation_metric)]
215
+ result_test += [splits]
216
+
217
+ print('Running valid dataset with final params (no gradients)..')
218
+ result_valid = eval_step(valid_datasets, best_style, softmax_temperature=best_softmax_temperature
219
+ , return_tensor=False, eval_positions=eval_positions_test,
220
+ bptt=bptt_final, inference_mode=True, model=model[2]
221
+ , selection_metric=selection_metric, evaluation_metric=evaluation_metric)
222
+
223
+ return result_test, result_valid
224
+
225
+ result_test, result_valid = final_evaluation()
226
+
227
+ return result_test, result_valid, best_style, best_softmax_temperature, optimization_routes
228
+
229
+
230
+ def eval_step(ds, used_style, selection_metric, evaluation_metric, eval_positions, return_tensor=True, **kwargs):
231
+ def step():
232
+ return evaluate(datasets=ds,
233
+ method='transformer'
234
+ , overwrite=True
235
+ , style=used_style
236
+ , eval_positions=eval_positions
237
+ , metric_used=selection_metric
238
+ , save=False
239
+ , path_interfix=None
240
+ , base_path=None
241
+ , verbose=True
242
+ , **kwargs)
243
+
244
+ if return_tensor:
245
+ r = step()
246
+ else:
247
+ with torch.no_grad():
248
+ r = step()
249
+
250
+ calculate_score_per_method(selection_metric, 'select', r, ds, eval_positions, aggregator='mean')
251
+ calculate_score_per_method(evaluation_metric, 'eval', r, ds, eval_positions, aggregator='mean')
252
+
253
+ return r
254
+
255
+
256
+ def gradient_optimize_style(model, init_style, steps, softmax_temperature, train_datasets, valid_datasets, learning_rate=0.03, optimize_all=False,
257
+ limit_style=True, N_datasets_sampled=90, optimize_softmax_temperature=True, selection_metric_min_max='max', **kwargs):
258
+ """
259
+ Uses gradient-based optimization to tune 'style' on the 'train_datasets', keeping the iterate that scores best on the 'valid_datasets'.
260
+
261
+ :param model:
262
+ :param init_style:
263
+ :param steps:
264
+ :param learning_rate:
265
+ :param softmax_temperature:
266
+ :param train_datasets:
267
+ :param valid_datasets:
268
+ :param optimize_all:
269
+ :param limit_style:
270
+ :param N_datasets_sampled:
271
+ :param optimize_softmax_temperature:
272
+ :param selection_metric_min_max:
273
+ :param kwargs:
274
+ :return:
275
+ """
276
+ grad_style = torch.nn.Parameter(init_style.detach(), requires_grad=True)
277
+
278
+ best_style, best_temperature, best_selection_metric, best_diffable_metric = grad_style.detach(), softmax_temperature.detach(), None, None
279
+ softmax_temperature = torch.nn.Parameter(softmax_temperature.detach(), requires_grad=optimize_softmax_temperature)
280
+ variables_to_optimize = model[2].parameters() if optimize_all else [grad_style, softmax_temperature]
281
+ optimizer = torch.optim.Adam(variables_to_optimize, lr=learning_rate)
282
+
283
+ optimization_route_selection, optimization_route_diffable = [], []
284
+ optimization_route_selection_valid, optimization_route_diffable_valid = [], []
285
+
286
+ def eval_opt(ds, return_tensor=True, inference_mode=False):
287
+ result = eval_step(ds, grad_style, softmax_temperature=softmax_temperature, return_tensor=return_tensor
288
+ , inference_mode=inference_mode, model=model[2], **kwargs)
289
+
290
+ diffable_metric = result['mean_metric']
291
+ selection_metric = result['mean_select']
292
+
293
+ return diffable_metric, selection_metric
294
+
295
+ def eval_all_datasets(datasets, propagate=True):
296
+ selection_metrics_this_step, diffable_metrics_this_step = [], []
297
+ for ds in datasets:
298
+ diffable_metric_train, selection_metric_train = eval_opt([ds], inference_mode=(not propagate))
299
+ if not torch.isnan(diffable_metric_train).any():
300
+ if propagate and diffable_metric_train.requires_grad == True:
301
+ diffable_metric_train.backward()
302
+ selection_metrics_this_step += [selection_metric_train]
303
+ diffable_metrics_this_step += [float(diffable_metric_train.detach().cpu().numpy())]
304
+ diffable_metric_train = np.nanmean(diffable_metrics_this_step)
305
+ selection_metric_train = np.nanmean(selection_metrics_this_step)
306
+
307
+ return diffable_metric_train, selection_metric_train
308
+
309
+ for t in tqdm(range(steps), desc='Iterate over Optimization steps'):
310
+ optimizer.zero_grad()
311
+
312
+ # Select subset of datasets
313
+ random.seed(t)
314
+ train_datasets_ = random.sample(train_datasets, N_datasets_sampled)
315
+
316
+ # Get score on train
317
+ diffable_metric_train, selection_metric_train = eval_all_datasets(train_datasets_, propagate=True)
318
+ optimization_route_selection += [float(selection_metric_train)]
319
+ optimization_route_diffable += [float(diffable_metric_train)]
320
+
321
+ # Get score on valid
322
+ diffable_metric_valid, selection_metric_valid = eval_all_datasets(valid_datasets, propagate=False)
323
+ optimization_route_selection_valid += [float(selection_metric_valid)]
324
+ optimization_route_diffable_valid += [float(diffable_metric_valid)]
325
+
326
+ is_best = (best_selection_metric is not None) and (selection_metric_min_max == 'min') and (best_selection_metric > selection_metric_valid)
327
+ is_best = is_best or ((best_selection_metric is not None) and (selection_metric_min_max == 'max') and (best_selection_metric < selection_metric_valid))
328
+ if (best_selection_metric is None) or (not np.isnan(selection_metric_valid) and is_best):
329
+ print('New best', best_selection_metric, selection_metric_valid)
330
+ best_style = grad_style.detach().clone()
331
+ best_temperature = softmax_temperature.detach().clone()
332
+ best_selection_metric, best_diffable_metric = selection_metric_valid, diffable_metric_valid
333
+
334
+ optimizer.step()
335
+
336
+ if limit_style:
337
+ grad_style = grad_style.detach().clamp(-1.74, 1.74)
338
+
339
+ print(f'Valid: Diffable metric={diffable_metric_valid} Selection metric={selection_metric_valid};' +
340
+ f'Train: Diffable metric={diffable_metric_train} Selection metric={selection_metric_train}')
341
+
342
+ print(f'Return best:{best_style} {best_selection_metric}')
343
+ return {'best_style': best_style, 'best_temperature': best_temperature
344
+ , 'optimization_route': {'select': optimization_route_selection, 'loss': optimization_route_diffable,
345
+ 'test_select': optimization_route_selection_valid, 'test_loss': optimization_route_diffable_valid}}
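The core pattern above (treat the style vector as the only learnable parameter, take Adam steps on it, and keep the copy that scores best on held-out data) can be shown in isolation. A minimal sketch under those assumptions, using generic callables `loss_fn(style)` and `valid_score(style)`; neither name comes from this repository.

import torch

def optimize_input_vector(loss_fn, valid_score, init, steps=10, lr=0.03, clamp=1.74):
    style = torch.nn.Parameter(init.clone().detach(), requires_grad=True)
    opt = torch.optim.Adam([style], lr=lr)
    best, best_score = style.detach().clone(), float('-inf')
    for _ in range(steps):
        opt.zero_grad()
        loss_fn(style).backward()          # gradient w.r.t. the input vector only
        opt.step()
        with torch.no_grad():
            style.clamp_(-clamp, clamp)    # mirrors the limit_style clamping above
            score = valid_score(style)
            if score > best_score:         # keep the best copy seen on validation data
                best, best_score = style.detach().clone(), score
    return best, best_score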
TabPFN/encoders.py ADDED
@@ -0,0 +1,225 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from utils import normalize_data
6
+ import torch.nn.functional as F
7
+ from torch.nn import TransformerEncoder, TransformerEncoderLayer
8
+
9
+
10
+ class StyleEncoder(nn.Module):
11
+ def __init__(self, em_size, hyperparameter_definitions):
12
+ super().__init__()
13
+ # self.embeddings = {}
14
+ self.em_size = em_size
15
+ # self.hyperparameter_definitions = {}
16
+ # for hp in hyperparameter_definitions:
17
+ # self.embeddings[hp] = nn.Linear(1, self.em_size)
18
+ # self.embeddings = nn.ModuleDict(self.embeddings)
19
+ self.embedding = nn.Linear(hyperparameter_definitions.shape[0], self.em_size)
20
+
21
+ def forward(self, hyperparameters): # T x B x num_features
22
+ # Make faster by using matrices
23
+ # sampled_embeddings = [torch.stack([
24
+ # self.embeddings[hp](torch.tensor([batch[hp]], device=self.embeddings[hp].weight.device, dtype=torch.float))
25
+ # for hp in batch
26
+ # ], -1).sum(-1) for batch in hyperparameters]
27
+ # return torch.stack(sampled_embeddings, 0)
28
+ return self.embedding(hyperparameters)
29
+
30
+
31
+ class _PositionalEncoding(nn.Module):
32
+ def __init__(self, d_model, dropout=0.):
33
+ super().__init__()
34
+ self.dropout = nn.Dropout(p=dropout)
35
+ self.d_model = d_model
36
+ self.device_test_tensor = nn.Parameter(torch.tensor(1.))
37
+
38
+ def forward(self, x):# T x B x num_features
39
+ assert self.d_model % x.shape[-1]*2 == 0
40
+ d_per_feature = self.d_model // x.shape[-1]
41
+ pe = torch.zeros(*x.shape, d_per_feature, device=self.device_test_tensor.device)
42
+ #position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
43
+ interval_size = 10
44
+ div_term = (1./interval_size) * 2*math.pi*torch.exp(torch.arange(0, d_per_feature, 2, device=self.device_test_tensor.device).float()*math.log(math.sqrt(2)))
45
+ #print(div_term/2/math.pi)
46
+ pe[..., 0::2] = torch.sin(x.unsqueeze(-1) * div_term)
47
+ pe[..., 1::2] = torch.cos(x.unsqueeze(-1) * div_term)
48
+ return self.dropout(pe).view(x.shape[0],x.shape[1],self.d_model)
49
+
50
+
51
+ Positional = lambda _, emsize: _PositionalEncoding(d_model=emsize)
52
+
53
+ class EmbeddingEncoder(nn.Module):
54
+ def __init__(self, num_features, em_size, num_embs=100):
55
+ super().__init__()
56
+ self.num_embs = num_embs
57
+ self.embeddings = nn.Embedding(num_embs * num_features, em_size, max_norm=True)
58
+ self.init_weights(.1)
59
+ self.min_max = (-2,+2)
60
+
61
+ @property
62
+ def width(self):
63
+ return self.min_max[1] - self.min_max[0]
64
+
65
+ def init_weights(self, initrange):
66
+ self.embeddings.weight.data.uniform_(-initrange, initrange)
67
+
68
+ def discretize(self, x):
69
+ split_size = self.width / self.num_embs
70
+ return ((x - self.min_max[0]) / split_size).int().clamp(0, self.num_embs - 1)  # parenthesized so the bin index is (x - min) / bin_width
71
+
72
+ def forward(self, x): # T x B x num_features
73
+ x_idxs = self.discretize(x)
74
+ x_idxs += torch.arange(x.shape[-1], device=x.device).view(1, 1, -1) * self.num_embs
75
+ # print(x_idxs,self.embeddings.weight.shape)
76
+ return self.embeddings(x_idxs).mean(-2)
77
+
78
+
79
+ class Normalize(nn.Module):
80
+ def __init__(self, mean, std):
81
+ super().__init__()
82
+ self.mean = mean
83
+ self.std = std
84
+
85
+ def forward(self, x):
86
+ return (x-self.mean)/self.std
87
+
88
+
89
+ def get_normalized_uniform_encoder(encoder_creator):
90
+ """
91
+ This wraps an encoder so that it can be fed uniform samples in [0,1]: the wrapper rescales them to mean 0 and std 1 before the wrapped encoder sees them.
92
+ For example, it can be used as `encoder_creator = get_normalized_uniform_encoder(encoders.Linear)`, now this can
93
+ be initialized with `encoder_creator(feature_dim, in_dim)`.
94
+ :param encoder_creator:
95
+ :return:
96
+ """
97
+ return lambda in_dim, out_dim: nn.Sequential(Normalize(.5, math.sqrt(1/12)), encoder_creator(in_dim, out_dim))
98
+
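Following the docstring, a short usage illustration (feature and embedding sizes are arbitrary; `Linear` is the encoder defined further down in this file):

encoder_creator = get_normalized_uniform_encoder(Linear)
encoder = encoder_creator(100, 512)  # maps 100 uniform-[0,1] features to 512-dim embeddings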
99
+
100
+ Linear = nn.Linear
101
+ MLP = lambda num_features, emsize: nn.Sequential(nn.Linear(num_features+1,emsize*2),
102
+ nn.ReLU(),
103
+ nn.Linear(emsize*2,emsize))
104
+
105
+ class NanHandlingEncoder(nn.Module):
106
+ def __init__(self, num_features, emsize, keep_nans=True):
107
+ super().__init__()
108
+ self.num_features = 2 * num_features if keep_nans else num_features
109
+ self.emsize = emsize
110
+ self.keep_nans = keep_nans
111
+ self.layer = nn.Linear(self.num_features, self.emsize)
112
+
113
+ def forward(self, x):
114
+ if self.keep_nans:
115
+ x = torch.cat([torch.nan_to_num(x, nan=0.0), normalize_data(torch.isnan(x) * -1
116
+ + torch.logical_and(torch.isinf(x), torch.sign(x) == 1) * 1
117
+ + torch.logical_and(torch.isinf(x), torch.sign(x) == -1) * 2
118
+ )], -1)
119
+ else:
120
+ x = torch.nan_to_num(x, nan=0.0)
121
+ return self.layer(x)
122
+
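To make the indicator block above concrete, this illustrative snippet (not part of the commit) shows the raw nan/inf code that is normalized and concatenated when keep_nans=True:

import torch
x = torch.tensor([[1.0, float('nan'), float('inf')]])
indicator = (torch.isnan(x) * -1
             + torch.logical_and(torch.isinf(x), torch.sign(x) == 1) * 1
             + torch.logical_and(torch.isinf(x), torch.sign(x) == -1) * 2)
print(indicator)  # tensor([[ 0, -1,  1]]): 0 = finite, -1 = nan, 1 = +inf, 2 = -inf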
123
+ class Linear(nn.Linear):
124
+ def __init__(self, num_features, emsize):
125
+ super().__init__(num_features, emsize)
126
+ self.num_features = num_features
127
+ self.emsize = emsize
128
+
129
+ def forward(self, x):
130
+ x = torch.nan_to_num(x, nan=0.0)
131
+ return super().forward(x)
132
+
133
+ class SequenceSpanningEncoder(nn.Module):
134
+ # Regular Encoder transforms Seq_len, B, S -> Seq_len, B, E attending only to last dimension
135
+ # This Encoder accesses the Seq_Len dimension additionally
136
+
137
+ # Why would we want this? We can learn normalization and embedding of features
138
+ # , this might be more important for e.g. categorical, ordinal feats, nan detection
139
+ # However maybe this can be easily learned through transformer as well?
140
+ # A problem is to make this work across any sequence length and be independent of ordering
141
+
142
+ # We could use average and maximum pooling and use those with a linear layer
143
+
144
+
145
+ # Another idea !! Similar to this we would like to encode features so that their number is variable
146
+ # We would like to embed features, also using knowledge of the features in the entire sequence
147
+
148
+ # We could use convolution or another transformer
149
+ # Convolution:
150
+
151
+ # Transformer/Conv across sequence dimension that encodes and normalizes features
152
+ # -> Transformer across feature dimension that encodes features to a constant size
153
+
154
+ # Conv with flexible features but no sequence info: S,B,F -(reshape)-> S*B,1,F
155
+ # -(Conv1d)-> S*B,N,F -(AvgPool,MaxPool)-> S*B,N,1 -> S,B,N
156
+ # This probably won't work since it's missing a way to recognize which feature is encoded
157
+
158
+ # Transformer with flexible features: S,B,F -> F,B*S,1 -> F2,B*S,1 -> S,B,F2
159
+
160
+ def __init__(self, num_features, em_size):
161
+ super().__init__()
162
+
163
+ raise NotImplementedError()
164
+ # Seq_len, B, S -> Seq_len, B, E
165
+ #
166
+ self.convs = torch.nn.ModuleList([nn.Conv1d(64 if i else 1, 64, 3) for i in range(5)])
167
+ # self.linear = nn.Linear(64, emsize)
168
+
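One way to realize the mean/max-pooling idea from the comments above is sketched below; this is a hand-rolled illustration under those assumptions, not the author's implementation, and it reuses the torch/nn imports at the top of this file.

class PooledSequenceEncoder(nn.Module):
    # Summarizes every feature over the sequence dimension with mean and max pooling,
    # then embeds each position together with that sequence-wide summary.
    def __init__(self, num_features, em_size):
        super().__init__()
        self.layer = nn.Linear(3 * num_features, em_size)

    def forward(self, x):  # x: (seq_len, batch, num_features)
        mean_pool = x.mean(dim=0, keepdim=True).expand_as(x)
        max_pool = x.max(dim=0, keepdim=True).values.expand_as(x)
        return self.layer(torch.cat([x, mean_pool, max_pool], dim=-1))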
169
+ class TransformerBasedFeatureEncoder(nn.Module):
170
+ def __init__(self, num_features, emsize):
171
+ super().__init__()
172
+
173
+ hidden_emsize = emsize
174
+ encoder = Linear(1, hidden_emsize)
175
+ n_out = emsize
176
+ nhid = 2*emsize
177
+ dropout =0.0
178
+ nhead=4
179
+ nlayers=4
180
+ model = nn.Transformer(nhead=nhead, num_encoder_layers=4, num_decoder_layers=4, d_model=1)
181
+
182
+ def forward(self, *input):
183
+ # S,B,F -> F,S*B,1 -> F2,S*B,1 -> S,B,F2
184
+ input = input.transpose()
185
+ self.model(input)
186
+
187
+ class Conv(nn.Module):
188
+ def __init__(self, input_size, emsize):
189
+ super().__init__()
190
+ self.convs = torch.nn.ModuleList([nn.Conv2d(64 if i else 1, 64, 3) for i in range(5)])
191
+ self.linear = nn.Linear(64,emsize)
192
+
193
+
194
+ def forward(self, x):
195
+ size = math.isqrt(x.shape[-1])
196
+ assert size*size == x.shape[-1]
197
+ x = x.reshape(*x.shape[:-1], 1, size, size)
198
+ for conv in self.convs:
199
+ if x.shape[-1] < 4:
200
+ break
201
+ x = conv(x)
202
+ x.relu_()
203
+ x = nn.AdaptiveAvgPool2d((1,1))(x).squeeze(-1).squeeze(-1)
204
+ return self.linear(x)
205
+
206
+
207
+
208
+
209
+ class CanEmb(nn.Embedding):
210
+ def __init__(self, num_features, num_embeddings: int, embedding_dim: int, *args, **kwargs):
211
+ assert embedding_dim % num_features == 0
212
+ embedding_dim = embedding_dim // num_features
213
+ super().__init__(num_embeddings, embedding_dim, *args, **kwargs)
214
+
215
+ def forward(self, x):
216
+ lx = x.long()
217
+ assert (lx == x).all(), "CanEmb only works with tensors of whole numbers"
218
+ x = super().forward(lx)
219
+ return x.view(*x.shape[:-2], -1)
220
+
221
+ def get_Canonical(num_classes):
222
+ return lambda num_features, emsize: CanEmb(num_features, num_classes, emsize)
223
+
224
+ def get_Embedding(num_embs_per_feature=100):
225
+ return lambda num_features, emsize: EmbeddingEncoder(num_features, emsize, num_embs=num_embs_per_feature)
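Both factories return encoder creators with the same `(num_features, emsize)` signature as `Linear` and `MLP`; an illustrative use (sizes arbitrary), mirroring how `model_builder.py` passes `get_Canonical(max_num_classes)` as the y-encoder:

y_encoder = get_Canonical(num_classes=10)(1, 512)             # CanEmb: embeds integer labels into 512 dims
x_encoder = get_Embedding(num_embs_per_feature=100)(5, 512)   # EmbeddingEncoder: 100 learned bins per feature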
TabPFN/initializers.py ADDED
@@ -0,0 +1,9 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ def get_NormalInitializer(std):
5
+ def initializer(m):
6
+ if isinstance(m, nn.Linear):
7
+ nn.init.normal_(m.weight, 0, std)
8
+ nn.init.normal_(m.bias, 0, std)
9
+ return initializer
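The returned function is meant to be passed to `Module.apply`; a one-line illustration with an arbitrary module:

mlp = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))
mlp.apply(get_NormalInitializer(std=0.02))  # re-initializes every nn.Linear weight and bias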
TabPFN/layer.py ADDED
@@ -0,0 +1,125 @@
1
+ from functools import partial
2
+
3
+ from torch import nn
4
+ from torch.nn.modules.transformer import *
5
+ from torch.nn.modules.transformer import _get_activation_fn
6
+
7
+ from torch.utils.checkpoint import checkpoint
8
+
9
+
10
+ class TransformerEncoderLayer(Module):
11
+ r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
12
+ This standard encoder layer is based on the paper "Attention Is All You Need".
13
+ Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
14
+ Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
15
+ Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
16
+ in a different way during application.
17
+
18
+ Args:
19
+ d_model: the number of expected features in the input (required).
20
+ nhead: the number of heads in the multiheadattention models (required).
21
+ dim_feedforward: the dimension of the feedforward network model (default=2048).
22
+ dropout: the dropout value (default=0.1).
23
+ activation: the activation function of intermediate layer, relu or gelu (default=relu).
24
+ layer_norm_eps: the eps value in layer normalization components (default=1e-5).
25
+ batch_first: If ``True``, then the input and output tensors are provided
26
+ as (batch, seq, feature). Default: ``False``.
27
+
28
+ Examples::
29
+ >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
30
+ >>> src = torch.rand(10, 32, 512)
31
+ >>> out = encoder_layer(src)
32
+
33
+ Alternatively, when ``batch_first`` is ``True``:
34
+ >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
35
+ >>> src = torch.rand(32, 10, 512)
36
+ >>> out = encoder_layer(src)
37
+ """
38
+ __constants__ = ['batch_first']
39
+
40
+ def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
41
+ layer_norm_eps=1e-5, batch_first=False, pre_norm=False,
42
+ device=None, dtype=None, recompute_attn=False) -> None:
43
+ factory_kwargs = {'device': device, 'dtype': dtype}
44
+ super().__init__()
45
+ self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
46
+ **factory_kwargs)
47
+ # Implementation of Feedforward model
48
+ self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
49
+ self.dropout = Dropout(dropout)
50
+ self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs)
51
+
52
+ self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
53
+ self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
54
+ self.dropout1 = Dropout(dropout)
55
+ self.dropout2 = Dropout(dropout)
56
+ self.pre_norm = pre_norm
57
+ self.recompute_attn = recompute_attn
58
+
59
+ self.activation = _get_activation_fn(activation)
60
+
61
+ def __setstate__(self, state):
62
+ if 'activation' not in state:
63
+ state['activation'] = F.relu
64
+ super().__setstate__(state)
65
+
66
+ def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
67
+ r"""Pass the input through the encoder layer.
68
+
69
+ Args:
70
+ src: the sequence to the encoder layer (required).
71
+ src_mask: the mask for the src sequence (optional).
72
+ src_key_padding_mask: the mask for the src keys per batch (optional).
73
+
74
+ Shape:
75
+ see the docs in Transformer class.
76
+ """
77
+ if self.pre_norm:
78
+ src_ = self.norm1(src)
79
+ else:
80
+ src_ = src
81
+ if isinstance(src_mask, tuple):
82
+ # global attention setup
83
+ assert not self.self_attn.batch_first
84
+ assert src_key_padding_mask is None
85
+
86
+ global_src_mask, trainset_src_mask, valset_src_mask = src_mask
87
+
88
+ num_global_tokens = global_src_mask.shape[0]
89
+ num_train_tokens = trainset_src_mask.shape[0]
90
+
91
+ global_tokens_src = src_[:num_global_tokens]
92
+ train_tokens_src = src_[num_global_tokens:num_global_tokens+num_train_tokens]
93
+ global_and_train_tokens_src = src_[:num_global_tokens+num_train_tokens]
94
+ eval_tokens_src = src_[num_global_tokens+num_train_tokens:]
95
+
96
+
97
+ attn = partial(checkpoint, self.self_attn) if self.recompute_attn else self.self_attn
98
+
99
+ global_tokens_src2 = attn(global_tokens_src, global_and_train_tokens_src, global_and_train_tokens_src, None, True, global_src_mask)[0]
100
+ train_tokens_src2 = attn(train_tokens_src, global_tokens_src, global_tokens_src, None, True, trainset_src_mask)[0]
101
+ eval_tokens_src2 = attn(eval_tokens_src, src_, src_,
102
+ None, True, valset_src_mask)[0]
103
+
104
+ src2 = torch.cat([global_tokens_src2, train_tokens_src2, eval_tokens_src2], dim=0)
105
+
106
+ else:
107
+ if self.recompute_attn:
108
+ src2 = checkpoint(self.self_attn, src_, src_, src_, src_key_padding_mask, True, src_mask)[0]
109
+ else:
110
+ src2 = self.self_attn(src_, src_, src_, attn_mask=src_mask,
111
+ key_padding_mask=src_key_padding_mask)[0]
112
+ src = src + self.dropout1(src2)
113
+ if not self.pre_norm:
114
+ src = self.norm1(src)
115
+
116
+ if self.pre_norm:
117
+ src_ = self.norm2(src)
118
+ else:
119
+ src_ = src
120
+ src2 = self.linear2(self.dropout(self.activation(self.linear1(src_))))
121
+ src = src + self.dropout2(src2)
122
+
123
+ if not self.pre_norm:
124
+ src = self.norm2(src)
125
+ return src
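In addition to the stock PyTorch examples in the docstring, the two switches added in this variant can be exercised as follows (illustrative; shapes are arbitrary):

import torch
layer = TransformerEncoderLayer(d_model=512, nhead=8, pre_norm=True, recompute_attn=True)
src = torch.rand(10, 32, 512)  # (seq_len, batch, d_model)
out = layer(src)               # pre-norm residual blocks; attention is recomputed during backward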
TabPFN/losses.py ADDED
@@ -0,0 +1,41 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ class CrossEntropyForMulticlassLoss(torch.nn.CrossEntropyLoss):
5
+ # This loss applies cross entropy after reducing the number of prediction
6
+ # dimensions to the number of classes in the target
7
+
8
+ # TODO: loss.item() doesn't work so the displayed losses are Nans
9
+ def __init__(self, num_classes, weight=None, size_average=None, ignore_index: int = -100,
10
+ reduce=None, reduction: str = 'mean', label_smoothing: float = 0.0) -> None:
11
+ super().__init__(size_average=size_average, reduce=reduce, reduction=reduction, ignore_index=ignore_index)
12
+ self.num_classes = num_classes
13
+
14
+ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
15
+ loss = torch.zeros_like(input[:, :, 0])
16
+ for b in range(target.shape[1]):
17
+ l = super().forward(input[:, b, 0:len(torch.unique(target[:, b]))], target[:, b])
18
+ loss[:, b] += l
19
+ return loss.flatten()
20
+
21
+ def JointBCELossWithLogits(output, target):
22
+ # output shape: (S, B, NS) with NS = Number of sequences
23
+ # target shape: (S, B, SL)
24
+ # Loss = -log(mean_NS(prod_SL(p(target_SL, output_NS))))
25
+ # Here at the moment NS = SL
26
+ output = output.unsqueeze(-1).repeat(1, 1, 1, target.shape[-1]) # (S, B, NS, SL)
27
+ output = output.permute(2, 0, 1, 3) # (NS, S, B, SL)
28
+ print(target.shape, output.shape)
29
+ loss = (target * torch.sigmoid(output)) + ((1-target) * (1-torch.sigmoid(output)))
30
+ loss = loss.prod(-1)
31
+ loss = loss.mean(0)
32
+ loss = -torch.log(loss)
33
+ loss = loss.mean()
34
+ return loss
35
+
36
+ class ScaledSoftmaxCE(nn.Module):
37
+ def forward(self, x, label):
38
+ logits = x[..., :-10]
39
+ temp_scales = x[..., -10:]
40
+
41
+ logprobs = logits.softmax(-1)
TabPFN/model_builder.py ADDED
@@ -0,0 +1,273 @@
1
+ from train import train, Losses
2
+ import priors
3
+ import encoders
4
+
5
+ from collections import defaultdict
6
+
7
+ from priors.utils import trunc_norm_sampler_f, gamma_sampler_f
8
+ from utils import get_uniform_single_eval_pos_sampler
9
+ import torch
10
+ import math
11
+
12
+ def save_model(model, path, filename, config_sample):
13
+ config_sample = {**config_sample}
14
+
15
+ def make_serializable(config_sample):
16
+ if isinstance(config_sample, dict):
17
+ config_sample = {k: make_serializable(config_sample[k]) for k in config_sample}
18
+ if isinstance(config_sample, list):
19
+ config_sample = [make_serializable(v) for v in config_sample]
20
+ if callable(config_sample):
21
+ config_sample = str(config_sample)
22
+ return config_sample
23
+
24
+ #if 'num_features_used' in config_sample:
25
+ # del config_sample['num_features_used']
26
+
27
+ #config_sample['num_classes_as_str'] = str(config_sample['num_classes'])
28
+ #del config_sample['num_classes']
29
+
30
+ config_sample = make_serializable(config_sample)
31
+
32
+ torch.save((model.state_dict(), None, config_sample), os.path.join(path, filename))
33
+
34
+
35
+ import subprocess as sp
36
+ import os
37
+
38
+ def get_gpu_memory():
39
+ command = "nvidia-smi"
40
+ memory_free_info = sp.check_output(command.split()).decode('ascii')
41
+ return memory_free_info
42
+
43
+
44
+ def load_model(path, filename, device, eval_positions, verbose):
45
+ # TODO: This function only restores evaluation functionality but training can't be continued. It is also not flexible.
46
+
47
+ model_state, optimizer_state, config_sample = torch.load(
48
+ os.path.join(path, filename), map_location='cpu')
49
+ if ('differentiable_hyperparameters' in config_sample
50
+ and 'prior_mlp_activations' in config_sample['differentiable_hyperparameters']):
51
+ config_sample['differentiable_hyperparameters']['prior_mlp_activations']['choice_values_used'] = config_sample[
52
+ 'differentiable_hyperparameters'][
53
+ 'prior_mlp_activations'][
54
+ 'choice_values']
55
+ config_sample['differentiable_hyperparameters']['prior_mlp_activations']['choice_values'] = [
56
+ torch.nn.Tanh for k in config_sample['differentiable_hyperparameters']['prior_mlp_activations']['choice_values']]
57
+
58
+ config_sample['categorical_features_sampler'] = lambda: lambda x: ([], [], [])
59
+ config_sample['num_features_used_in_training'] = config_sample['num_features_used']
60
+ config_sample['num_features_used'] = lambda: config_sample['num_features']
61
+ config_sample['num_classes_in_training'] = config_sample['num_classes']
62
+ config_sample['num_classes'] = 2
63
+ config_sample['batch_size_in_training'] = config_sample['batch_size']
64
+ config_sample['batch_size'] = 1
65
+ config_sample['bptt_in_training'] = config_sample['bptt']
66
+ config_sample['bptt'] = 10
67
+ config_sample['bptt_extra_samples_in_training'] = config_sample['bptt_extra_samples']
68
+ config_sample['bptt_extra_samples'] = None
69
+
70
+ #print('Memory', str(get_gpu_memory()))
71
+
72
+ model = get_model(config_sample, device=device, should_train=False, verbose=verbose)
73
+ module_prefix = 'module.'
74
+ model_state = {k.replace(module_prefix, ''): v for k, v in model_state.items()}
75
+ model[2].load_state_dict(model_state)
76
+ model[2].to(device)
77
+
78
+ return model, config_sample
79
+
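An illustrative call (the checkpoint name is one of the files added in this commit; the path is assumed to be relative to the TabPFN directory):

model, c = load_model(path='models_diff',
                      filename='prior_diff_real_checkpoint_n_8x_lr0.0003_epoch_49.cpkt',
                      device='cpu', eval_positions=[1000], verbose=False)
transformer = model[2]  # index 2 of the returned tuple is the nn.Module, as used throughout this file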
80
+ def fix_loaded_config_sample(loaded_config_sample, config):
81
+ def copy_to_sample(*k):
82
+ t,s = loaded_config_sample, config
83
+ for k_ in k[:-1]:
84
+ t = t[k_]
85
+ s = s[k_]
86
+ t[k[-1]] = s[k[-1]]
87
+ copy_to_sample('num_features_used')
88
+ copy_to_sample('num_classes')
89
+ copy_to_sample('differentiable_hyperparameters','prior_mlp_activations','choice_values')
90
+
91
+ def load_config_sample(path, template_config):
92
+ model_state, optimizer_state, loaded_config_sample = torch.load(path, map_location='cpu')
93
+ fix_loaded_config_sample(loaded_config_sample, template_config)
94
+ return loaded_config_sample
95
+
96
+ def get_default_spec(test_datasets, valid_datasets):
97
+ bptt = 10000
98
+ eval_positions = [1000, 2000, 3000, 4000, 5000] # list(2 ** np.array([4, 5, 6, 7, 8, 9, 10, 11, 12]))
99
+ max_features = max([X.shape[1] for (_, X, _, _, _, _) in test_datasets] + [X.shape[1] for (_, X, _, _, _, _) in valid_datasets])
100
+ max_splits = 5
101
+
102
+ return bptt, eval_positions, max_features, max_splits
103
+
104
+ def get_mlp_prior_hyperparameters(config):
105
+ config = {hp: (list(config[hp].values())[0]) if type(config[hp]) is dict else config[hp] for hp in config}
106
+
107
+ if "prior_sigma_gamma_k" in config:
108
+ sigma_sampler = gamma_sampler_f(config["prior_sigma_gamma_k"], config["prior_sigma_gamma_theta"])
109
+ config['init_std'] = sigma_sampler
110
+ if "prior_noise_std_gamma_k" in config:
111
+ noise_std_sampler = gamma_sampler_f(config["prior_noise_std_gamma_k"], config["prior_noise_std_gamma_theta"])
112
+ config['noise_std'] = noise_std_sampler
113
+
114
+ return config
115
+
116
+
117
+ def get_gp_mix_prior_hyperparameters(config):
118
+ return {'lengthscale_concentration': config["prior_lengthscale_concentration"],
119
+ 'nu': config["prior_nu"],
120
+ 'outputscale_concentration': config["prior_outputscale_concentration"],
121
+ 'categorical_data': config["prior_y_minmax_norm"],
122
+ 'y_minmax_norm': config["prior_lengthscale_concentration"],
123
+ 'noise_concentration': config["prior_noise_concentration"],
124
+ 'noise_rate': config["prior_noise_rate"]}
125
+
126
+ def get_gp_prior_hyperparameters(config):
127
+ return {hp: (list(config[hp].values())[0]) if type(config[hp]) is dict else config[hp] for hp in config}
128
+
129
+
130
+ def get_meta_gp_prior_hyperparameters(config):
131
+ config = {hp: (list(config[hp].values())[0]) if type(config[hp]) is dict else config[hp] for hp in config}
132
+
133
+ if "outputscale_mean" in config:
134
+ outputscale_sampler = trunc_norm_sampler_f(config["outputscale_mean"]
135
+ , config["outputscale_mean"] * config["outputscale_std_f"])
136
+ config['outputscale'] = outputscale_sampler
137
+ if "lengthscale_mean" in config:
138
+ lengthscale_sampler = trunc_norm_sampler_f(config["lengthscale_mean"],
139
+ config["lengthscale_mean"] * config["lengthscale_std_f"])
140
+ config['lengthscale'] = lengthscale_sampler
141
+
142
+ return config
143
+
144
+
145
+ def get_model(config, device, should_train=True, verbose=False, state_dict=None, epoch_callback=None):
146
+ extra_kwargs = {}
147
+ verbose_train, verbose_prior = verbose >= 1, verbose >= 2
148
+ config['verbose'] = verbose_prior
149
+
150
+ if 'aggregate_k_gradients' not in config or config['aggregate_k_gradients'] is None:
151
+ config['aggregate_k_gradients'] = math.ceil(config['batch_size'] * ((config['nlayers'] * config['emsize'] * config['bptt'] * config['bptt']) / 10824640000))
152
+
153
+ config['num_steps'] = math.ceil(config['num_steps'] * config['aggregate_k_gradients'])
154
+ config['batch_size'] = math.ceil(config['batch_size'] / config['aggregate_k_gradients'])
155
+ config['recompute_attn'] = config['recompute_attn'] if 'recompute_attn' in config else False
156
+
157
+ def make_get_batch(model_proto, **extra_kwargs):
158
+ extra_kwargs = defaultdict(lambda: None, **extra_kwargs)
159
+ return (lambda batch_size, seq_len, num_features, hyperparameters
160
+ , device, model_proto=model_proto, get_batch=extra_kwargs['get_batch']
161
+ , prior_bag_priors=extra_kwargs['prior_bag_priors']: model_proto.get_batch(
162
+ batch_size=batch_size
163
+ , seq_len=seq_len
164
+ , device=device
165
+ , get_batch=get_batch
166
+ , hyperparameters=hyperparameters
167
+ , num_features=num_features))
168
+
169
+ if config['prior_type'] == 'prior_bag':
170
+ # Prior bag combines priors
171
+ get_batch_gp = make_get_batch(priors.fast_gp)
172
+ get_batch_mlp = make_get_batch(priors.mlp)
173
+ if 'flexible' in config and config['flexible']:
174
+ get_batch_gp = make_get_batch(priors.flexible_categorical, **{'get_batch': get_batch_gp})
175
+ get_batch_mlp = make_get_batch(priors.flexible_categorical, **{'get_batch': get_batch_mlp})
176
+ prior_bag_hyperparameters = {'prior_bag_get_batch': (get_batch_gp, get_batch_mlp)
177
+ , 'prior_bag_exp_weights_1': 2.0}
178
+ prior_hyperparameters = {**get_mlp_prior_hyperparameters(config), **get_gp_prior_hyperparameters(config)
179
+ , **prior_bag_hyperparameters}
180
+ model_proto = priors.prior_bag
181
+ else:
182
+ if config['prior_type'] == 'mlp':
183
+ prior_hyperparameters = get_mlp_prior_hyperparameters(config)
184
+ model_proto = priors.mlp
185
+ elif config['prior_type'] == 'gp':
186
+ prior_hyperparameters = get_gp_prior_hyperparameters(config)
187
+ model_proto = priors.fast_gp
188
+ elif config['prior_type'] == 'gp_mix':
189
+ prior_hyperparameters = get_gp_mix_prior_hyperparameters(config)
190
+ model_proto = priors.fast_gp_mix
191
+ else:
192
+ raise Exception()
193
+
194
+ if 'flexible' in config and config['flexible']:
195
+ get_batch_base = make_get_batch(model_proto)
196
+ extra_kwargs['get_batch'] = get_batch_base
197
+ model_proto = priors.flexible_categorical
198
+
199
+ use_style = False
200
+
201
+ if 'differentiable' in config and config['differentiable']:
202
+ get_batch_base = make_get_batch(model_proto, **extra_kwargs)
203
+ extra_kwargs = {'get_batch': get_batch_base, 'differentiable_hyperparameters': config['differentiable_hyperparameters']}
204
+ model_proto = priors.differentiable_prior
205
+ use_style = True
206
+ print(f"Using style prior: {use_style}")
207
+
208
+ if (('nan_prob_no_reason' in config and config['nan_prob_no_reason'] > 0.0) or
209
+ ('nan_prob_a_reason' in config and config['nan_prob_a_reason'] > 0.0) or
210
+ ('nan_prob_unknown_reason' in config and config['nan_prob_unknown_reason'] > 0.0)):
211
+ encoder = encoders.NanHandlingEncoder
212
+ else:
213
+ encoder = encoders.Linear
214
+
215
+ num_outputs = config['num_outputs'] if 'num_outputs' in config else 1
216
+ if config['max_num_classes'] == 2:
217
+ if 'joint_loss' in config and config['joint_loss']:
218
+ loss = JointBCELossWithLogits
219
+ else:
220
+ loss = Losses.bce
221
+ elif config['max_num_classes'] > 2:
222
+ loss = Losses.ce(torch.ones((config['max_num_classes'])))
223
+ else:
224
+ loss = BarDistribution(borders=get_bucket_limits(500, full_range=(-10, 10)))
225
+
226
+ aggregate_k_gradients = 1 if 'aggregate_k_gradients' not in config else config['aggregate_k_gradients']
227
+ check_is_compatible = False if 'multiclass_loss_type' not in config else (config['multiclass_loss_type'] == 'compatible')
228
+ config['multiclass_type'] = config['multiclass_type'] if 'multiclass_type' in config else 'rank'
229
+ config['mix_activations'] = config['mix_activations'] if 'mix_activations' in config else False
230
+
231
+ config['bptt_extra_samples'] = config['bptt_extra_samples'] if 'bptt_extra_samples' in config else None
232
+ config['eval_positions'] = [int(config['bptt'] * 0.95)] if config['bptt_extra_samples'] is None else [int(config['bptt'])]
233
+
234
+ epochs = 0 if not should_train else config['epochs']
235
+ model = train(model_proto.DataLoader
236
+ , loss
237
+ , encoder
238
+ , style_encoder_generator = encoders.StyleEncoder if use_style else None
239
+ , emsize=config['emsize']
240
+ , nhead=config['nhead']
241
+ , y_encoder_generator= encoders.get_Canonical(config['max_num_classes']) if config.get('canonical_y_encoder', False) else encoders.Linear
242
+ , pos_encoder_generator=None
243
+ , batch_size=config['batch_size']
244
+ , nlayers=config['nlayers']
245
+ , nhid=config['emsize'] * config['nhid_factor']
246
+ , epochs=epochs
247
+ , total_available_time_in_s=config.get('total_available_time_in_s', None)
248
+ , warmup_epochs=20
249
+ , bptt=config['bptt']
250
+ , gpu_device=device
251
+ , dropout=config['dropout']
252
+ , steps_per_epoch=config['num_steps']
253
+ , single_eval_pos_gen=get_uniform_single_eval_pos_sampler(config['bptt'])
254
+ , load_weights_from_this_state_dict=state_dict
255
+ , aggregate_k_gradients=aggregate_k_gradients
256
+ , check_is_compatible=check_is_compatible
257
+ , recompute_attn=config['recompute_attn']
258
+ , epoch_callback=epoch_callback
259
+ , bptt_extra_samples = config['bptt_extra_samples']
260
+ , extra_prior_kwargs_dict={
261
+ 'num_features': config['num_features']
262
+ , 'fuse_x_y': False
263
+ , 'hyperparameters': prior_hyperparameters
264
+ , 'num_outputs':num_outputs
265
+ , 'dynamic_batch_size': 1 if ('num_global_att_tokens' in config and config['num_global_att_tokens']) else 2
266
+ , **extra_kwargs
267
+ }
268
+ , lr=config['lr']
269
+ , verbose=verbose_train,
270
+ weight_decay=config.get('weight_decay', 0.0),
271
+ normalize_labels=True)
272
+
273
+ return model
TabPFN/models_diff/gp_ablation_model.cpkt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7b0c8febc553cca3fdee265b5a1cd7567dbf83da855969940be4707a9218ffb
3
+ size 69460013
TabPFN/models_diff/prior_diff_real_checkpoint_n_8x_lr0.0003_epoch_49.cpkt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dae97f45bd53d719fc2b23fac4ec55eab16d63892196d939b1bb1c3b408be242
3
+ size 103616779
TabPFN/notebook_utils.py ADDED
@@ -0,0 +1,32 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import io
5
+ import torch
6
+ import pickle
7
+
8
+ def print_models(base_path, model_string):
9
+ print(model_string)
10
+
11
+ for i in range(80):
12
+ for e in range(50):
13
+ exists = Path(os.path.join(base_path, f'models_diff/prior_diff_real_checkpoint{model_string}_n_{i}_epoch_{e}.cpkt')).is_file()
14
+ if exists:
15
+ print(os.path.join(base_path, f'models_diff/prior_diff_real_checkpoint{model_string}_n_{i}_epoch_{e}.cpkt'))
16
+ print()
17
+
18
+ class CustomUnpickler(pickle.Unpickler):
19
+ def find_class(self, module, name):
20
+ if name == 'Manager':
21
+ from settings import Manager
22
+ return Manager
23
+ try:
24
+ return self.find_class_cpu(module, name)
25
+ except:
26
+ return None
27
+
28
+ def find_class_cpu(self, module, name):
29
+ if module == 'torch.storage' and name == '_load_from_bytes':
30
+ return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
31
+ else:
32
+ return super().find_class(module, name)
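The unpickler exists so that pickles referencing CUDA storage (or the project-local `Manager` class) can still be loaded on a CPU-only machine; a sketch using the results pickle shipped in this repository (path assumed relative to the TabPFN directory):

with open('prior_tuning_result.pkl', 'rb') as f:
    result = CustomUnpickler(f).load()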
TabPFN/positional_encodings.py ADDED
@@ -0,0 +1,70 @@
1
+ import math
2
+
3
+ import torch
4
+ from torch import nn
5
+
6
+
7
+ # Protocol for positional encodings.
8
+ # __init__(d_model, max_len=..[, more optionals])
9
+ # forward(x: (seq_len, bs, d_model)) -> Tensor of shape (*x.shape[:2],d_model) containing pos. embeddings
10
+
11
+
12
+ class NoPositionalEncoding(nn.Module):
13
+ def __init__(self, d_model, max_len=None):
14
+ super(NoPositionalEncoding, self).__init__()
15
+ pass
16
+
17
+ def forward(self, x):
18
+ return x #* math.sqrt(x.shape[-1])
19
+
20
+
21
+ class PositionalEncoding(nn.Module):
22
+ def __init__(self, d_model, max_len=5000):
23
+ super(PositionalEncoding, self).__init__()
24
+ pe = torch.zeros(max_len, d_model)
25
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
26
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
27
+ pe[:, 0::2] = torch.sin(position * div_term)
28
+ pe[:, 1::2] = torch.cos(position * div_term)
29
+ pe = pe.unsqueeze(0).transpose(0, 1)
30
+ self.register_buffer('pe', pe)
31
+
32
+ def forward(self, x):
33
+ x = self.pe[:x.size(0), :] + x # * math.sqrt(x.shape[-1])
34
+ return x
35
+
36
+
37
+ class LearnedPositionalEncoding(nn.Module):
38
+ def __init__(self, d_model, max_len=5000):
39
+ super(LearnedPositionalEncoding, self).__init__()
40
+ self.max_seq_len = max_len
41
+ #self.positional_embeddings = nn.Embedding(max_len, d_model)
42
+ self.positional_embeddings = nn.Parameter(torch.empty(max_len, d_model))
43
+ nn.init.normal_(self.positional_embeddings, mean=0, std=d_model ** -0.5)
44
+
45
+ def forward(self, x):
46
+ seq_len, bs, d_model = x.shape
47
+ assert seq_len <= len(self.positional_embeddings), 'seq_len can be at most max_len.'
48
+ pos_emb = self.positional_embeddings[:seq_len]
49
+ return pos_emb.unsqueeze(1).expand(seq_len, bs, d_model) + x #* math.sqrt(x.shape[-1])
50
+
51
+
52
+ class PairedScrambledPositionalEncodings(LearnedPositionalEncoding):
53
+ # TODO check whether it is a problem to use the same perm. for full batch
54
+ def forward(self, x):
55
+ seq_len, bs, d_model = x.shape
56
+ assert seq_len <= len(self.positional_embeddings), 'seq_len can be at most max_len.'
57
+ assert len(self.positional_embeddings) % 2 == 0, 'Please specify an even max_len.'
58
+
59
+ paired_embs = self.positional_embeddings.view(len(self.positional_embeddings), -1, 2)
60
+ pos_emb = paired_embs[torch.randperm(len(paired_embs))].view(*self.positional_embeddings.shape)[:seq_len]
61
+
62
+ return pos_emb.unsqueeze(1).expand(seq_len, bs, d_model) + x #* math.sqrt(x.shape[-1])
63
+
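Every class above follows the protocol stated at the top of this file; a short illustration with the sinusoidal variant (shapes arbitrary):

import torch
pe = PositionalEncoding(d_model=512, max_len=5000)
x = torch.rand(10, 32, 512)  # (seq_len, bs, d_model)
out = pe(x)                  # adds fixed sin/cos embeddings for the first 10 positions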
64
+
65
+
66
+
67
+
68
+
69
+
70
+
TabPFN/prior_tuning_result.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24d2189bbc836aeea888cf6c540f2c1b45b5351822931189e8bf10a0bc80a0b6
3
+ size 18668851
TabPFN/priors/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from . import fast_gp, mlp, flexible_categorical, differentiable_prior, prior_bag
2
+
3
+
4
+
TabPFN/priors/differentiable_prior.py ADDED
@@ -0,0 +1,293 @@
1
+ import torch
2
+ from torch import nn
3
+ import math
4
+
5
+ from .utils import get_batch_to_dataloader
6
+ from utils import default_device
7
+ from .utils import order_by_y, normalize_by_used_features_f
8
+
9
+ from .utils import trunc_norm_sampler_f, beta_sampler_f, gamma_sampler_f, uniform_sampler_f, zipf_sampler_f, scaled_beta_sampler_f, uniform_int_sampler_f
10
+
11
+
12
+ def unpack_dict_of_tuples(d):
13
+ # Returns list of dicts where each dict i contains values of tuple position i
14
+ # {'a': (1,2), 'b': (3,4)} -> [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}]
15
+ return [dict(zip(d.keys(), v)) for v in list(zip(*list(d.values())))]
16
+
17
+ class DifferentiableHyperparameter(nn.Module):
18
+ ## We can sample this and get a hyperparameter value and a normalized hyperparameter indicator
19
+ def __init__(self, distribution, embedding_dim, device, **args):
20
+ super(DifferentiableHyperparameter, self).__init__()
21
+
22
+ self.distribution = distribution
23
+ self.embedding_dim = embedding_dim
24
+ self.device=device
25
+ for key in args:
26
+ setattr(self, key, args[key])
27
+
28
+ def get_sampler():
29
+ #if self.distribution == "beta":
30
+ # return beta_sampler_f(self.a, self.b), 0, 1
31
+ #elif self.distribution == "gamma":
32
+ # return gamma_sampler_f(self.a, self.b), 0, 1
33
+ #elif self.distribution == "beta_int":
34
+ # return scaled_beta_sampler_f(self.a, self.b, self.scale, self.min), self.scale + self.min, self.min, self.a / (self.a + self.b)
35
+ if self.distribution == "uniform":
36
+ if not hasattr(self, 'sample'):
37
+ return uniform_sampler_f(self.min, self.max), self.min, self.max, (self.max+self.min) / 2, math.sqrt(1/12*(self.max-self.min)*(self.max-self.min))
38
+ else:
39
+ return lambda: self.sample, self.min, self.max, None, None
40
+ elif self.distribution == "uniform_int":
41
+ return uniform_int_sampler_f(self.min, self.max), self.min, self.max, (self.max+self.min) / 2, math.sqrt(1/12*(self.max-self.min)*(self.max-self.min))
42
+
43
+ if self.distribution.startswith("meta"):
44
+ self.hparams = {}
45
+ def sample_meta(f):
46
+ indicators, passed = unpack_dict_of_tuples({hp: self.hparams[hp]() for hp in self.hparams})
47
+ # sampled_embeddings = list(itertools.chain.from_iterable([sampled_embeddings[k] for k in sampled_embeddings]))
48
+ meta_passed = f(**passed)
49
+ return indicators, meta_passed
50
+
51
+ args_passed = {'device': device, 'embedding_dim': embedding_dim}
52
+ if self.distribution == "meta_beta":
53
+ ## Truncated normal where std and mean are drawn randomly logarithmically scaled
54
+ if hasattr(self, 'b') and hasattr(self, 'k'):
55
+ self.hparams = {'b': lambda: (None, self.b), 'k': lambda: (None, self.k)}
56
+ else:
57
+ self.hparams = {"b": DifferentiableHyperparameter(distribution="uniform", min=self.min
58
+ , max=self.max, **args_passed)
59
+ , "k": DifferentiableHyperparameter(distribution="uniform", min=self.min
60
+ , max=self.max, **args_passed)}
61
+ def make_beta(b, k):
62
+ return lambda b=b, k=k: self.scale * beta_sampler_f(b, k)()
63
+ self.sampler = lambda make_beta=make_beta : sample_meta(make_beta)
64
+ elif self.distribution == "meta_trunc_norm_log_scaled":
65
+ # these choices are copied down below, don't change these without changing `replace_differentiable_distributions`
66
+ self.min_std = self.min_std if hasattr(self, 'min_std') else 0.001
67
+ self.max_std = self.max_std if hasattr(self, 'max_std') else self.max_mean
68
+ ## Truncated normal where std and mean are drawn randomly logarithmically scaled
69
+ if not hasattr(self, 'log_mean'):
70
+ self.hparams = {"log_mean": DifferentiableHyperparameter(distribution="uniform", min=math.log(self.min_mean)
71
+ , max=math.log(self.max_mean), **args_passed)
72
+ , "log_std": DifferentiableHyperparameter(distribution="uniform", min=math.log(self.min_std)
73
+ , max=math.log(self.max_std), **args_passed)}
74
+ else:
75
+ self.hparams = {'log_mean': lambda: (None, self.log_mean), 'log_std': lambda: (None, self.log_std)}
76
+ def make_trunc_norm(log_mean, log_std):
77
+ return ((lambda : self.lower_bound + round(trunc_norm_sampler_f(math.exp(log_mean), math.exp(log_std))())) if self.round
78
+ else (lambda: self.lower_bound + trunc_norm_sampler_f(math.exp(log_mean), math.exp(log_std))()))
79
+
80
+ self.sampler = lambda make_trunc_norm=make_trunc_norm: sample_meta(make_trunc_norm)
81
+ elif self.distribution == "meta_trunc_norm":
82
+ self.min_std = self.min_std if hasattr(self, 'min_std') else 0
83
+ self.max_std = self.max_std if hasattr(self, 'max_std') else self.max_mean
84
+ self.hparams = {"mean": DifferentiableHyperparameter(distribution="uniform", min=self.min_mean
85
+ , max=self.max_mean, **args_passed)
86
+ , "std": DifferentiableHyperparameter(distribution="uniform", min=self.min_std
87
+ , max=self.max_std, **args_passed)}
88
+ def make_trunc_norm(mean, std):
89
+ return ((lambda: self.lower_bound + round(
90
+ trunc_norm_sampler_f(math.exp(mean), math.exp(std))())) if self.round
91
+ else (
92
+ lambda make_trunc_norm=make_trunc_norm: self.lower_bound + trunc_norm_sampler_f(math.exp(mean), math.exp(std))()))
93
+ self.sampler = lambda : sample_meta(make_trunc_norm)
94
+ elif self.distribution == "meta_choice":
95
+ if hasattr(self, 'choice_1_weight'):
96
+ self.hparams = {f'choice_{i}_weight': lambda: (None, getattr(self, f'choice_{i}_weight')) for i in range(1, len(self.choice_values))}
97
+ else:
98
+ self.hparams = {f"choice_{i}_weight": DifferentiableHyperparameter(distribution="uniform", min=-5.0
99
+ , max=6.0, **args_passed) for i in range(1, len(self.choice_values))}
100
+ def make_choice(**choices):
101
+ weights = torch.softmax(torch.tensor([1.0] + [choices[i] for i in choices], dtype=torch.float), 0) # create a tensor of weights
102
+ sample = torch.multinomial(weights, 1, replacement=True).numpy()[0]
103
+ return self.choice_values[sample]
104
+
105
+ self.sampler = lambda make_choice=make_choice: sample_meta(make_choice)
106
+ elif self.distribution == "meta_choice_mixed":
107
+ if hasattr(self, 'choice_1_weight'):
108
+ self.hparams = {f'choice_{i}_weight': lambda: (None, getattr(self, f'choice_{i}_weight')) for i in range(1, len(self.choice_values))}
109
+ else:
110
+ self.hparams = {f"choice_{i}_weight": DifferentiableHyperparameter(distribution="uniform", min=-5.0
111
+ , max=6.0, **args_passed) for i in range(1, len(self.choice_values))}
112
+ def make_choice(**choices):
113
+ weights = torch.softmax(torch.tensor([1.0] + [choices[i] for i in choices], dtype=torch.float), 0) # create a tensor of weights
114
+ def sample():
115
+ s = torch.multinomial(weights, 1, replacement=True).numpy()[0]
116
+ return self.choice_values[s]()
117
+ return lambda: sample
118
+
119
+ self.sampler = lambda make_choice=make_choice: sample_meta(make_choice)
120
+ else:
121
+ def return_two(x, min, max, mean, std):
122
+ # Returns (a hyperparameter value, and an indicator value passed to the model)
123
+ if mean is not None:
124
+ ind = (x-mean)/std#(2 * (x-min) / (max-min) - 1)
125
+ else:
126
+ ind = None
127
+ return ind, x # normalize indicator to [-1, 1]
128
+ # def sample_standard(sampler_f, embedding):
129
+ # s = torch.tensor([sampler_f()], device = self.device)
130
+ # return s, embedding(s)
131
+ self.sampler_f, self.sampler_min, self.sampler_max, self.sampler_mean, self.sampler_std = get_sampler()
132
+ self.sampler = lambda : return_two(self.sampler_f(), min=self.sampler_min, max=self.sampler_max
133
+ , mean=self.sampler_mean, std=self.sampler_std)
134
+ # self.embedding_layer = nn.Linear(1, self.embedding_dim, device=self.device)
135
+ # self.embed = lambda x : self.embedding_layer(
136
+ # (x - self.sampler_min) / (self.sampler_max - self.sampler_min))
137
+ #self.sampler = lambda : sample_standard(self.sampler_f, self.embedding)
138
+
139
+
140
+ def forward(self):
141
+ s, s_passed = self.sampler()
142
+ return s, s_passed
143
+
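A small illustration of the two values returned by forward (a normalized indicator for the model and the concrete value passed on to the prior); the parameters here are arbitrary:

hp = DifferentiableHyperparameter(distribution='uniform', min=0.0, max=10.0,
                                  embedding_dim=None, device='cpu')
indicator, value = hp()  # value ~ U[0, 10]; indicator is (value - mean) / std of that uniform range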
144
+
145
+
146
+ class DifferentiableHyperparameterList(nn.Module):
147
+ def __init__(self, hyperparameters, embedding_dim, device):
148
+ super().__init__()
149
+
150
+ self.device = device
151
+ hyperparameters = {k: v for (k, v) in hyperparameters.items() if v}
152
+ self.hyperparameters = nn.ModuleDict({hp: DifferentiableHyperparameter(embedding_dim = embedding_dim
153
+ , name = hp
154
+ , device = device, **hyperparameters[hp]) for hp in hyperparameters})
155
+ def get_hyperparameter_info(self):
156
+ sampled_hyperparameters_f, sampled_hyperparameters_keys = [], []
157
+ def append_hp(hp_key, hp_val):
158
+ sampled_hyperparameters_keys.append(hp_key)
159
+ # Function remaps hyperparameters from [-1, 1] range to true value
160
+ s_min, s_max, s_mean, s_std = hp_val.sampler_min, hp_val.sampler_max, hp_val.sampler_mean, hp_val.sampler_std
161
+ sampled_hyperparameters_f.append((lambda x: (x-s_mean)/s_std, lambda y : (y * s_std)+s_mean))
162
+ #sampled_hyperparameters_f.append(((lambda x: ((x - s_min) / (s_max - s_min) * (2) - 1)
163
+ # , (lambda y: ((y + 1) * (1 / 2) * (s_max - s_min) + s_min))))
164
+ for hp in self.hyperparameters:
165
+ hp_val = self.hyperparameters[hp]
166
+ if hasattr(hp_val, 'hparams'):
167
+ for hp_ in hp_val.hparams:
168
+ append_hp(f'{hp}_{hp_}', hp_val.hparams[hp_])
169
+ else:
170
+ append_hp(hp, hp_val)
171
+
172
+
173
+ return sampled_hyperparameters_keys, sampled_hyperparameters_f
174
+
175
+ def sample_parameter_object(self):
176
+ sampled_hyperparameters, s_passed = {}, {}
177
+ for hp in self.hyperparameters:
178
+ sampled_hyperparameters_, s_passed_ = self.hyperparameters[hp]()
179
+ s_passed[hp] = s_passed_
180
+ if isinstance(sampled_hyperparameters_, dict):
181
+ sampled_hyperparameters_ = {hp + '_' + str(key): val for key, val in sampled_hyperparameters_.items()}
182
+ sampled_hyperparameters.update(sampled_hyperparameters_)
183
+ else:
184
+ sampled_hyperparameters[hp] = sampled_hyperparameters_
185
+
186
+ # s_passed contains the values passed to the get_batch function
187
+ # sampled_hyperparameters contains the indicator of the sampled value, i.e. only number that describe the sampled object
188
+ return s_passed, sampled_hyperparameters#self.pack_parameter_object(sampled_embeddings)
189
+
190
+ class DifferentiablePrior(torch.nn.Module):
191
+ def __init__(self, get_batch, hyperparameters, differentiable_hyperparameters, args):
192
+ super(DifferentiablePrior, self).__init__()
193
+
194
+ self.h = hyperparameters
195
+ self.args = args
196
+ self.get_batch = get_batch
197
+ self.differentiable_hyperparameters = DifferentiableHyperparameterList(differentiable_hyperparameters
198
+ , embedding_dim=self.h['emsize']
199
+ , device=self.args['device'])
200
+
201
+ def forward(self):
202
+ # Sample hyperparameters
203
+ sampled_hyperparameters_passed, sampled_hyperparameters_indicators = self.differentiable_hyperparameters.sample_parameter_object()
204
+
205
+ hyperparameters = {**self.h, **sampled_hyperparameters_passed}
206
+ x, y, y_ = self.get_batch(hyperparameters=hyperparameters, **self.args)
207
+
208
+ return x, y, y_, sampled_hyperparameters_indicators
209
+
210
+
211
+ # TODO: Make this a class that keeps objects
212
+ @torch.no_grad()
213
+ def get_batch(batch_size, seq_len, num_features, get_batch
214
+ , device=default_device, differentiable_hyperparameters={}
215
+ , hyperparameters=None, batch_size_per_gp_sample=None, **kwargs):
216
+ batch_size_per_gp_sample = batch_size_per_gp_sample or (min(64, batch_size))
217
+ num_models = batch_size // batch_size_per_gp_sample
218
+ assert num_models * batch_size_per_gp_sample == batch_size, f'Batch size ({batch_size}) not divisible by batch_size_per_gp_sample ({batch_size_per_gp_sample})'
219
+
220
+ args = {'device': device, 'seq_len': seq_len, 'num_features': num_features, 'batch_size': batch_size_per_gp_sample}
221
+
222
+ models = [DifferentiablePrior(get_batch, hyperparameters, differentiable_hyperparameters, args) for _ in range(num_models)]
223
+ sample = sum([[model()] for model in models], [])
224
+
225
+ x, y, y_, hyperparameter_dict = zip(*sample)
226
+
227
+ if 'verbose' in hyperparameters and hyperparameters['verbose']:
228
+ print('Hparams', hyperparameter_dict[0].keys())
229
+
230
+ hyperparameter_matrix = []
231
+ for batch in hyperparameter_dict:
232
+ hyperparameter_matrix.append([batch[hp] for hp in batch])
233
+
234
+ transposed_hyperparameter_matrix = list(zip(*hyperparameter_matrix))
235
+ assert all([all([hp is None for hp in hp_]) or all([hp is not None for hp in hp_]) for hp_ in transposed_hyperparameter_matrix]), 'if a hyperparameter is None for one model in the batch, it must be None for all of them'
236
+ # we remove columns that are only None (i.e. not sampled)
237
+ hyperparameter_matrix = [[hp for hp in hp_ if hp is not None] for hp_ in hyperparameter_matrix]
238
+ if len(hyperparameter_matrix[0]) > 0:
239
+ packed_hyperparameters = torch.tensor(hyperparameter_matrix)
240
+ packed_hyperparameters = torch.repeat_interleave(packed_hyperparameters, repeats=batch_size_per_gp_sample, dim=0).detach()
241
+ else:
242
+ packed_hyperparameters = None
243
+
244
+ x, y, y_, packed_hyperparameters = (torch.cat(x, 1).detach()
245
+ , torch.cat(y, 1).detach()
246
+ , torch.cat(y_, 1).detach()
247
+ , packed_hyperparameters)#list(itertools.chain.from_iterable(itertools.repeat(x, batch_size_per_gp_sample) for x in packed_hyperparameters)))#torch.repeat_interleave(torch.stack(packed_hyperparameters, 0).detach(), repeats=batch_size_per_gp_sample, dim=0))
248
+
249
+ return x, y, y_, packed_hyperparameters
250
+
251
+ DataLoader = get_batch_to_dataloader(get_batch)
252
+ DataLoader.num_outputs = 1
253
+ #DataLoader.validate = lambda : 0
254
+
255
+ def draw_random_style(dl, device):
256
+ (hp_embedding, data, targets_), targets = next(iter(dl))
257
+ return hp_embedding.to(device)[0:1, :]
258
+
259
+ def merge_style_with_info(diff_hparams_keys, diff_hparams_f, style, transform=True):
260
+ params = dict(zip(diff_hparams_keys, zip(diff_hparams_f, style.detach().cpu().numpy().tolist()[0])))
261
+ def t(v):
262
+ if transform:
263
+ return v[0][1](v[1])
264
+ else:
265
+ return v[1]
266
+ return {k : t(v) for k, v in params.items()}
267
+
268
+
269
+ import ConfigSpace.hyperparameters as CSH
270
+
271
+ def replace_differentiable_distributions(config):
272
+ diff_config = config['differentiable_hyperparameters']
273
+ for name, diff_hp_dict in diff_config.items():
274
+ distribution = diff_hp_dict['distribution']
275
+ if distribution == 'uniform':
276
+ diff_hp_dict['sample'] = CSH.UniformFloatHyperparameter(name, diff_hp_dict['min'], diff_hp_dict['max'])
277
+ elif distribution == 'meta_beta':
278
+ diff_hp_dict['k'] = CSH.UniformFloatHyperparameter(name+'_k', diff_hp_dict['min'], diff_hp_dict['max'])
279
+ diff_hp_dict['b'] = CSH.UniformFloatHyperparameter(name+'_b', diff_hp_dict['min'], diff_hp_dict['max'])
280
+ elif distribution == 'meta_choice':
281
+ for i in range(1, len(diff_hp_dict['choice_values'])):
282
+ diff_hp_dict[f'choice_{i}_weight'] = CSH.UniformFloatHyperparameter(name+f'choice_{i}_weight', -5.0, 6.0)
283
+ elif distribution == 'meta_choice_mixed':
284
+ for i in range(1, len(diff_hp_dict['choice_values'])):
285
+ diff_hp_dict[f'choice_{i}_weight'] = CSH.UniformFloatHyperparameter(name+f'choice_{i}_weight', -5.0, 6.0)
286
+ elif distribution == 'meta_trunc_norm_log_scaled':
287
+ diff_hp_dict['log_mean'] = CSH.UniformFloatHyperparameter(name+'_log_mean', math.log(diff_hp_dict['min_mean']), math.log(diff_hp_dict['max_mean']))
288
+ min_std = diff_hp_dict['min_std'] if 'min_std' in diff_hp_dict else 0.001
289
+ max_std = diff_hp_dict['max_std'] if 'max_std' in diff_hp_dict else diff_hp_dict['max_mean']
290
+ diff_hp_dict['log_std'] = CSH.UniformFloatHyperparameter(name+'_log_std', math.log(min_std), math.log(max_std))
291
+ else:
292
+ raise ValueError(f'Unknown distribution {distribution}')
293
+
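For orientation, a minimal usage sketch of replace_differentiable_distributions above (the config dict and its values are made up for illustration; the real configs come from scripts/model_configs.py). The function rewrites each differentiable-hyperparameter entry in place, attaching ConfigSpace objects that a tuner can sample.

from priors.differentiable_prior import replace_differentiable_distributions

# Hypothetical config; only the structure matches what the function expects.
config = {'differentiable_hyperparameters': {
    'noise': {'distribution': 'uniform', 'min': 1e-4, 'max': 0.5},
    'lengthscale': {'distribution': 'meta_trunc_norm_log_scaled',
                    'min_mean': 0.01, 'max_mean': 3.0, 'min_std': 0.01, 'max_std': 1.0},
}}
replace_differentiable_distributions(config)
# 'noise' now carries a CSH.UniformFloatHyperparameter under 'sample';
# 'lengthscale' carries 'log_mean' and 'log_std' hyperparameters instead.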
TabPFN/priors/fast_gp.py ADDED
@@ -0,0 +1,144 @@
1
+ import time
2
+
3
+ import torch
4
+ from torch import nn
5
+ import gpytorch
6
+
7
+ from .utils import get_batch_to_dataloader
8
+ from utils import default_device
9
+
10
+
11
+ # We will use the simplest form of GP model, exact inference
12
+ class ExactGPModel(gpytorch.models.ExactGP):
13
+ def __init__(self, train_x, train_y, likelihood):
14
+ super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
15
+ self.mean_module = gpytorch.means.ConstantMean()
16
+ self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
17
+
18
+ def forward(self, x):
19
+ mean_x = self.mean_module(x)
20
+ covar_x = self.covar_module(x)
21
+ return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
22
+
23
+
24
+ def get_model(x, y, hyperparameters):
25
+ likelihood = gpytorch.likelihoods.GaussianLikelihood(noise_constraint=gpytorch.constraints.GreaterThan(1.e-9))
26
+ model = ExactGPModel(x, y, likelihood)
27
+ model.likelihood.noise = torch.ones_like(model.likelihood.noise) * hyperparameters["noise"]
28
+ model.covar_module.outputscale = torch.ones_like(model.covar_module.outputscale) * hyperparameters["outputscale"]
29
+ model.covar_module.base_kernel.lengthscale = torch.ones_like(model.covar_module.base_kernel.lengthscale) * \
30
+ hyperparameters["lengthscale"]
31
+ return model, likelihood
32
+
33
+
34
+ @torch.no_grad()
35
+ def get_batch(batch_size, seq_len, num_features, device=default_device, hyperparameters=None,
36
+ equidistant_x=False, fix_x=None, **kwargs):
37
+ if isinstance(hyperparameters, (tuple, list)):
38
+ hyperparameters = {"noise": hyperparameters[0]
39
+ , "outputscale": hyperparameters[1]
40
+ , "lengthscale": hyperparameters[2]
41
+ , "is_binary_classification": hyperparameters[3]
42
+ # , "num_features_used": hyperparameters[4]
43
+ , "normalize_by_used_features": hyperparameters[5]
44
+ , "order_y": hyperparameters[6]
45
+ , "sampling": hyperparameters[7]
46
+ }
47
+ elif hyperparameters is None:
48
+ hyperparameters = {"noise": .1, "outputscale": .1, "lengthscale": .1}
49
+
50
+ if 'verbose' in hyperparameters and hyperparameters['verbose']:
51
+ print({"noise": hyperparameters['noise'], "outputscale": hyperparameters['outputscale']
52
+ , "lengthscale": hyperparameters['lengthscale'], 'batch_size': batch_size, 'sampling': hyperparameters['sampling']})
53
+
54
+ # hyperparameters = {k: hyperparameters[k]() if callable(hyperparameters[k]) else hyperparameters[k] for k in
55
+ # hyperparameters.keys()}
56
+ assert not (equidistant_x and (fix_x is not None))
57
+
58
+ with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations', (True, True, True))):
59
+ if equidistant_x:
60
+ assert num_features == 1
61
+ x = torch.linspace(0, 1., seq_len).unsqueeze(0).repeat(batch_size, 1).unsqueeze(-1)
62
+ elif fix_x is not None:
63
+ assert fix_x.shape == (seq_len, num_features)
64
+ x = fix_x.unsqueeze(0).repeat(batch_size, 1, 1).to(device)
65
+ else:
66
+ if hyperparameters.get('sampling','uniform') == 'uniform':
67
+ x = torch.rand(batch_size, seq_len, num_features, device=device)
68
+ else:
69
+ x = torch.randn(batch_size, seq_len, num_features, device=device)
70
+ model, likelihood = get_model(x, torch.Tensor(), hyperparameters)
71
+ model.to(device)
72
+ # trained_model = ExactGPModel(train_x, train_y, likelihood).cuda()
73
+ # trained_model.eval()
74
+ is_fitted = False
75
+ while not is_fitted:
76
+ try:
77
+ with gpytorch.settings.prior_mode(True):
78
+ model, likelihood = get_model(x, torch.Tensor(), hyperparameters)
79
+ model.to(device)
80
+
81
+ d = model(x)
82
+ d = likelihood(d)
83
+ sample = d.sample().transpose(0, 1)
84
+ is_fitted = True
85
+ except RuntimeError: # This can happen when torch.linalg.eigh fails. Restart with new init resolves this.
86
+ print('GP fitting unsuccessful, retrying...')
87
+ print(x)
88
+ print(hyperparameters)
89
+
90
+ if bool(torch.any(torch.isnan(x)).detach().cpu().numpy()):
91
+ print({"noise": hyperparameters['noise'], "outputscale": hyperparameters['outputscale']
92
+ , "lengthscale": hyperparameters['lengthscale'], 'batch_size': batch_size})
93
+
94
+ # TODO: Multi output
95
+ return x.transpose(0, 1), sample, sample # x.shape = (T,B,H)
96
+
97
+ DataLoader = get_batch_to_dataloader(get_batch)
98
+ DataLoader.num_outputs = 1
99
+
100
+ def get_model_on_device(x,y,hyperparameters,device):
101
+ model, likelihood = get_model(x, y, hyperparameters)
102
+ model.to(device)
103
+ return model, likelihood
104
+
105
+
106
+ @torch.no_grad()
107
+ def evaluate(x, y, y_non_noisy, use_mse=False, hyperparameters={}, get_model_on_device=get_model_on_device, device=default_device, step_size=1, start_pos=0):
108
+ start_time = time.time()
109
+ losses_after_t = [.0] if start_pos == 0 else []
110
+ all_losses_after_t = []
111
+
112
+ with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations',(True,True,True))), gpytorch.settings.fast_pred_var(False):
113
+ for t in range(max(start_pos, 1), len(x), step_size):
114
+ loss_sum = 0.
115
+ model, likelihood = get_model_on_device(x[:t].transpose(0, 1), y[:t].transpose(0, 1), hyperparameters, device)
116
+
117
+
118
+ model.eval()
119
+ # print([t.shape for t in model.train_inputs])
120
+ # print(x[:t].transpose(0,1).shape, x[t].unsqueeze(1).shape, y[:t].transpose(0,1).shape)
121
+ f = model(x[t].unsqueeze(1))
122
+ l = likelihood(f)
123
+ means = l.mean.squeeze()
124
+ varis = l.covariance_matrix.squeeze()
125
+ # print(l.variance.squeeze(), l.mean.squeeze(), y[t])
126
+
127
+ assert len(means.shape) == len(varis.shape) == 1
128
+ assert len(means) == len(varis) == x.shape[1]
129
+
130
+ if use_mse:
131
+ c = nn.MSELoss(reduction='none')
132
+ ls = c(means, y[t])
133
+ else:
134
+ ls = -l.log_prob(y[t].unsqueeze(1))
135
+
136
+ losses_after_t.append(ls.mean())
137
+ all_losses_after_t.append(ls.flatten())
138
+ return torch.stack(all_losses_after_t).to('cpu'), torch.tensor(losses_after_t).to('cpu'), time.time() - start_time
139
+
140
+ if __name__ == '__main__':
141
+ hps = {'noise': .1, 'outputscale': .1, 'lengthscale': .1}  # dict form expected by get_batch and evaluate
142
+ for redo_idx in range(1):
143
+ print(
144
+ evaluate(*get_batch(1000, 10, hyperparameters=hps, num_features=10), use_mse=False, hyperparameters=hps))
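A minimal sketch of drawing one batch from this GP prior (the hyperparameter values are illustrative, not tuned defaults); outputs follow the (seq_len, batch, features) convention noted above.

from priors import fast_gp

hps = {'noise': 1e-4, 'outputscale': 1.0, 'lengthscale': 0.5, 'sampling': 'uniform'}
x, y, y_ = fast_gp.get_batch(batch_size=4, seq_len=50, num_features=3, hyperparameters=hps)
assert x.shape == (50, 4, 3) and y.shape == (50, 4)  # y_ is the same tensor as y here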
TabPFN/priors/flexible_categorical.py ADDED
@@ -0,0 +1,240 @@
1
+ import time
2
+ import random
3
+
4
+ import torch
5
+ from torch import nn
6
+
7
+ from .utils import get_batch_to_dataloader
8
+ from utils import normalize_data, nan_handling_missing_for_unknown_reason_value, nan_handling_missing_for_no_reason_value, nan_handling_missing_for_a_reason_value, to_ranking_low_mem, remove_outliers
9
+ from .utils import normalize_by_used_features_f, randomize_classes, CategoricalActivation
10
+ from .utils import uniform_int_sampler_f
11
+
12
+ time_it = False
13
+
14
+ class BalancedBinarize(nn.Module):
15
+ def __init__(self):
16
+ super().__init__()
17
+
18
+ def forward(self, x):
19
+ return (x > torch.median(x)).float()
20
+
21
+ def class_sampler_f(min_, max_):
22
+ def s():
23
+ if random.random() > 0.5:
24
+ return uniform_int_sampler_f(min_, max_)()
25
+ return 2
26
+ return s
27
+
28
+ class MulticlassRank(nn.Module):
29
+ def __init__(self, num_classes, ordered_p=0.5):
30
+ super().__init__()
31
+ self.num_classes = class_sampler_f(2, num_classes)()
32
+ self.ordered_p = ordered_p
33
+
34
+ def forward(self, x):
35
+ # x has shape (T,B,H)
36
+
37
+ # CAUTION: This samples the same idx in sequence for each class boundary in a batch
38
+ class_boundaries = torch.randint(0, x.shape[0], (self.num_classes - 1,))
39
+ class_boundaries = x[class_boundaries].unsqueeze(1)
40
+
41
+ d = (x > class_boundaries).sum(axis=0)
42
+
43
+ randomized_classes = torch.rand((d.shape[1], )) > self.ordered_p
44
+ d[:, randomized_classes] = randomize_classes(d[:, randomized_classes], self.num_classes)
45
+ reverse_classes = torch.rand((d.shape[1],)) > 0.5
46
+ d[:, reverse_classes] = self.num_classes - 1 - d[:, reverse_classes]
47
+ return d
48
+
49
+ class MulticlassValue(nn.Module):
50
+ def __init__(self, num_classes, ordered_p=0.5):
51
+ super().__init__()
52
+ self.num_classes = class_sampler_f(2, num_classes)()
53
+ self.classes = nn.Parameter(torch.randn(num_classes-1), requires_grad=False)
54
+ self.ordered_p = ordered_p
55
+
56
+ def forward(self, x):
57
+ # x has shape (T,B,H)
58
+ d = (x > (self.classes.unsqueeze(-1).unsqueeze(-1))).sum(axis=0)
59
+
60
+ randomized_classes = torch.rand((d.shape[1],)) > self.ordered_p
61
+ d[:, randomized_classes] = randomize_classes(d[:, randomized_classes], self.num_classes)
62
+ reverse_classes = torch.rand((d.shape[1],)) > 0.5
63
+ d[:, reverse_classes] = self.num_classes - 1 - d[:, reverse_classes]
64
+ return d
65
+
66
+ class MulticlassMultiNode(nn.Module):
67
+ def __init__(self, num_classes, ordered_p=0.5):
68
+ super().__init__()
69
+ self.num_classes = class_sampler_f(2, num_classes)()
70
+ self.classes = nn.Parameter(torch.randn(num_classes-1), requires_grad=False)
71
+ self.alt_multi_class = MulticlassValue(num_classes, ordered_p)
72
+
73
+ def forward(self, x):
74
+ # x has shape T, B, H
75
+ if len(x.shape) == 2:
76
+ return self.alt_multi_class(x)
77
+ T = 3
78
+ x[torch.isnan(x)] = 0.00001
79
+ d = torch.multinomial(torch.pow(0.00001+torch.sigmoid(x[:, :, 0:self.num_classes]).reshape(-1, self.num_classes), T), 1, replacement=True).reshape(x.shape[0], x.shape[1]).float()
80
+ return d
81
+
82
+
83
+ class FlexibleCategorical(torch.nn.Module):
84
+ def __init__(self, get_batch, hyperparameters, args):
85
+ super(FlexibleCategorical, self).__init__()
86
+
87
+ self.h = {k: hyperparameters[k]() if callable(hyperparameters[k]) else hyperparameters[k] for k in
88
+ hyperparameters.keys()}
89
+ self.args = args
90
+ self.args_passed = {**self.args}
91
+ self.args_passed.update({'num_features': self.h['num_features_used']})
92
+ self.get_batch = get_batch
93
+
94
+ if self.h['num_classes'] > 1 and not self.h['balanced']:
95
+ if self.h['multiclass_type'] == 'rank':
96
+ self.class_assigner = MulticlassRank(self.h['num_classes']
97
+ , ordered_p=self.h['output_multiclass_ordered_p']
98
+ )
99
+ elif self.h['multiclass_type'] == 'value':
100
+ self.class_assigner = MulticlassValue(self.h['num_classes']
101
+ , ordered_p=self.h['output_multiclass_ordered_p']
102
+ )
103
+ elif self.h['multiclass_type'] == 'multi_node':
104
+ self.class_assigner = MulticlassMultiNode(self.h['num_classes'])
105
+ else:
106
+ raise ValueError("Unknown multiclass type")
107
+ elif self.h['num_classes'] == 2 and self.h['balanced']:
108
+ self.class_assigner = BalancedBinarize()
109
+ elif self.h['num_classes'] > 2 and self.h['balanced']:
110
+ raise NotImplementedError("Balanced multiclass training is not possible")
111
+ else:
112
+ self.class_assigner = lambda x:x # Regression
113
+
114
+ def drop_for_reason(self, x, v):
115
+ nan_prob_sampler = CategoricalActivation(ordered_p=0.0
116
+ , categorical_p=1.0
117
+ , keep_activation_size=False,
118
+ num_classes_sampler=lambda: 20)
119
+ d = nan_prob_sampler(x)
120
+ # TODO: Make a different ordering for each activation
121
+ x[d < torch.rand((1,), device=x.device) * 20 * self.h['nan_prob_no_reason'] * random.random()] = v
122
+ return x
123
+
124
+ def drop_for_no_reason(self, x, v):
125
+ x[torch.rand(x.shape, device=self.args['device']) < self.h['nan_prob_no_reason']] = v
126
+ return x
127
+
128
+ def forward(self, batch_size):
129
+ start = time.time()
130
+ x, y, y_ = self.get_batch(hyperparameters=self.h, **self.args_passed)
131
+ if time_it:
132
+ print('Flex Forward Block 1', round(time.time() - start, 3))
133
+
134
+ start = time.time()
135
+
136
+ if self.h['nan_prob_no_reason']+self.h['nan_prob_a_reason']+self.h['nan_prob_unknown_reason'] > 0 and random.random() > 0.5: # Only one out of two datasets should have nans
137
+ if self.h['nan_prob_no_reason'] > 0 and random.random() > 0.5: # Missing for no reason
138
+ x = self.drop_for_no_reason(x, nan_handling_missing_for_no_reason_value(self.h['set_value_to_nan']))
139
+
140
+ if self.h['nan_prob_a_reason'] > 0 and random.random() > 0.5: # Missing for a reason
141
+ x = self.drop_for_reason(x, nan_handling_missing_for_a_reason_value(self.h['set_value_to_nan']))
142
+
143
+ if self.h['nan_prob_unknown_reason'] > 0: # Missing for unknown reason
144
+ if random.random() < self.h['nan_prob_unknown_reason_reason_prior']:
145
+ x = self.drop_for_no_reason(x, nan_handling_missing_for_unknown_reason_value(self.h['set_value_to_nan']))
146
+ else:
147
+ x = self.drop_for_reason(x, nan_handling_missing_for_unknown_reason_value(self.h['set_value_to_nan']))
148
+
149
+ # Categorical features
150
+ if 'categorical_feature_p' in self.h and random.random() > 1 - self.h['categorical_feature_p']:
151
+ p = random.random()
152
+ for col in range(x.shape[2]):
153
+ m = MulticlassRank(10, ordered_p=0.3)
154
+ if random.random() > p:
155
+ x[:, :, col] = m(x[:, :, col])
156
+
157
+ if time_it:
158
+ print('Flex Forward Block 2', round(time.time() - start, 3))
159
+ start = time.time()
160
+
161
+ if self.h['normalize_to_ranking']:
162
+ x = to_ranking_low_mem(x)
163
+ else:
164
+ x = remove_outliers(x)
165
+ x, y = normalize_data(x), normalize_data(y)
166
+
167
+ if time_it:
168
+ print('Flex Forward Block 3', round(time.time() - start, 3))
169
+ start = time.time()
170
+
171
+ # Cast to classification if enabled
172
+ y = self.class_assigner(y).float()
173
+
174
+ if time_it:
175
+ print('Flex Forward Block 4', round(time.time() - start, 3))
176
+ start = time.time()
177
+ if self.h['normalize_by_used_features']:
178
+ x = normalize_by_used_features_f(x, self.h['num_features_used'], self.args['num_features'], normalize_with_sqrt=self.h.get('normalize_with_sqrt',False))
179
+ if time_it:
180
+ print('Flex Forward Block 5', round(time.time() - start, 3))
181
+
182
+ start = time.time()
183
+ # Append empty features if enabled
184
+ x = torch.cat(
185
+ [x, torch.zeros((x.shape[0], x.shape[1], self.args['num_features'] - self.h['num_features_used']),
186
+ device=self.args['device'])], -1)
187
+ if time_it:
188
+ print('Flex Forward Block 6', round(time.time() - start, 3))
189
+
190
+ return x, y, y # x.shape = (T,B,H)
191
+
192
+ import torch.cuda as cutorch
193
+
194
+ @torch.no_grad()
195
+ def get_batch(batch_size, seq_len, num_features, get_batch, device, hyperparameters=None, batch_size_per_gp_sample=None, **kwargs):
196
+ batch_size_per_gp_sample = batch_size_per_gp_sample or (min(32, batch_size))
197
+ num_models = batch_size // batch_size_per_gp_sample
198
+ assert num_models > 0, f'Batch size ({batch_size}) is too small for batch_size_per_gp_sample ({batch_size_per_gp_sample})'
199
+ assert num_models * batch_size_per_gp_sample == batch_size, f'Batch size ({batch_size}) not divisible by batch_size_per_gp_sample ({batch_size_per_gp_sample})'
200
+
201
+ # Sample one seq_len for entire batch
202
+ seq_len = hyperparameters['seq_len_used']() if callable(hyperparameters['seq_len_used']) else seq_len
203
+
204
+ args = {'device': device, 'seq_len': seq_len, 'num_features': num_features, 'batch_size': batch_size_per_gp_sample}
205
+
206
+ models = [FlexibleCategorical(get_batch, hyperparameters, args).to(device) for _ in range(num_models)]
207
+
208
+ start = time.time()
209
+ sample = sum([[model(batch_size=batch_size_per_gp_sample)] for model in models], [])
210
+ #print('sample', time.time() - start)
211
+
212
+ x, y, y_ = zip(*sample)
213
+ x, y, y_ = torch.cat(x, 1).detach(), torch.cat(y, 1).detach(), torch.cat(y_, 1).detach()
214
+
215
+ # # TODO: Reintegrate this code (Doesn't work on batch dim), could be applied to each batch sample individually
216
+ # if hyperparameters['is_binary_classification'] and hyperparameters['order_y']:
217
+ # x, y = order_by_y(x, y)
218
+
219
+ return x, y, y_
220
+
221
+ # num_features_used = num_features_used_sampler()
222
+ # prior_outputscale = prior_outputscale_sampler()
223
+ # prior_lengthscale = prior_lengthscale_sampler()
224
+ #
225
+ # x, sample = normalize_data(x), normalize_data(sample)
226
+ #
227
+ # if is_binary_classification:
228
+ # sample = (sample > torch.median(sample, dim=0)[0]).float()
229
+ #
230
+ # if normalize_by_used_features:
231
+ # x = normalize_by_used_features_f(x, num_features_used, num_features)
232
+ #
233
+ # # # if is_binary_classification and order_y:
234
+ # # # x, sample = order_by_y(x, sample)
235
+ # #
236
+ # # Append empty features if enabled
237
+ # x = torch.cat([x, torch.zeros((x.shape[0], x.shape[1], num_features - num_features_used), device=device)], -1)
238
+
239
+ DataLoader = get_batch_to_dataloader(get_batch)
240
+ DataLoader.num_outputs = 1
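As a toy illustration of the rank-based class assignment used by MulticlassRank above (a sketch, not repo code): thresholds are drawn from the sequence itself, and a sample's label is the number of thresholds it exceeds.

import torch

y = torch.randn(100, 1)                       # (seq_len, batch) with batch = 1
boundaries = y[torch.randint(0, 100, (3,))]   # 3 boundaries -> up to 4 classes
labels = (y > boundaries.unsqueeze(1)).sum(0) # label = count of exceeded boundaries, in {0, ..., 3}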
TabPFN/priors/mlp.py ADDED
@@ -0,0 +1,173 @@
1
+ import random
2
+ import math
3
+
4
+ import torch
5
+ from torch import nn
6
+ import numpy as np
7
+
8
+ from utils import default_device
9
+ from .utils import get_batch_to_dataloader
10
+
11
+ class GaussianNoise(nn.Module):
12
+ def __init__(self, std, device):
13
+ super().__init__()
14
+ self.std = std
15
+ self.device=device
16
+
17
+ def forward(self, x):
18
+ return x + torch.normal(torch.zeros_like(x), self.std)
19
+
20
+
21
+ def causes_sampler_f(num_causes):
22
+ means = np.random.normal(0, 1, (num_causes))
23
+ std = np.abs(np.random.normal(0, 1, (num_causes)) * means)
24
+ return means, std
25
+
26
+ def get_batch(batch_size, seq_len, num_features, hyperparameters, device=default_device, num_outputs=1, sampling='normal', **kwargs):
27
+ if ('mix_activations' in hyperparameters) and hyperparameters['mix_activations']:
28
+ s = hyperparameters['prior_mlp_activations']()
29
+ hyperparameters['prior_mlp_activations'] = lambda : s
30
+
31
+ class MLP(torch.nn.Module):
32
+ def __init__(self, hyperparameters):
33
+ super(MLP, self).__init__()
34
+
35
+ with torch.no_grad():
36
+
37
+ for key in hyperparameters:
38
+ setattr(self, key, hyperparameters[key])
39
+
40
+ assert (self.num_layers >= 2)
41
+
42
+ if 'verbose' in hyperparameters and self.verbose:
43
+ print({k : hyperparameters[k] for k in ['is_causal', 'num_causes', 'prior_mlp_hidden_dim'
44
+ , 'num_layers', 'noise_std', 'y_is_effect', 'pre_sample_weights', 'prior_mlp_dropout_prob'
45
+ , 'pre_sample_causes']})
46
+
47
+ if self.is_causal:
48
+ self.prior_mlp_hidden_dim = max(self.prior_mlp_hidden_dim, num_outputs + 2 * num_features)
49
+ else:
50
+ self.num_causes = num_features
51
+
52
+ # This means that the mean and standard deviation of each cause is determined in advance
53
+ if self.pre_sample_causes:
54
+ self.causes_mean, self.causes_std = causes_sampler_f(self.num_causes)
55
+ self.causes_mean = torch.tensor(self.causes_mean, device=device).unsqueeze(0).unsqueeze(0).tile(
56
+ (seq_len, 1, 1))
57
+ self.causes_std = torch.tensor(self.causes_std, device=device).unsqueeze(0).unsqueeze(0).tile(
58
+ (seq_len, 1, 1))
59
+
60
+ def generate_module(layer_idx, out_dim):
61
+ # Determine std of each noise term in initialization, so that is shared in runs
62
+ # torch.abs(torch.normal(torch.zeros((out_dim)), self.noise_std)) - Change std for each dimension?
63
+ noise = (GaussianNoise(torch.abs(torch.normal(torch.zeros(size=(1, out_dim), device=device), float(self.noise_std))), device=device)
64
+ if self.pre_sample_weights else GaussianNoise(float(self.noise_std), device=device))
65
+ return [
66
+ nn.Sequential(*[self.prior_mlp_activations()
67
+ , nn.Linear(self.prior_mlp_hidden_dim, out_dim)
68
+ , noise])
69
+ ]
70
+
71
+ self.layers = [nn.Linear(self.num_causes, self.prior_mlp_hidden_dim, device=device)]
72
+ self.layers += [module for layer_idx in range(self.num_layers-1) for module in generate_module(layer_idx, self.prior_mlp_hidden_dim)]
73
+ if not self.is_causal:
74
+ self.layers += generate_module(-1, num_outputs)
75
+ self.layers = nn.Sequential(*self.layers)
76
+
77
+ # Initialize Model parameters
78
+ for i, (n, p) in enumerate(self.layers.named_parameters()):
79
+ if self.block_wise_dropout:
80
+ if len(p.shape) == 2: # Only apply to weight matrices and not bias
81
+ nn.init.zeros_(p)
82
+ # TODO: N blocks should be a setting
83
+ n_blocks = random.randint(1, math.ceil(math.sqrt(min(p.shape[0], p.shape[1]))))
84
+ w, h = p.shape[0] // n_blocks, p.shape[1] // n_blocks
85
+ keep_prob = (n_blocks*w*h) / p.numel()
86
+ for block in range(0, n_blocks):
87
+ nn.init.normal_(p[w * block: w * (block+1), h * block: h * (block+1)], std=self.init_std / keep_prob**(1/2))
88
+ else:
89
+ if len(p.shape) == 2: # Only apply to weight matrices and not bias
90
+ dropout_prob = self.prior_mlp_dropout_prob if i > 0 else 0.0 # Don't apply dropout in first layer
91
+ dropout_prob = min(dropout_prob, 0.99)
92
+ nn.init.normal_(p, std=self.init_std / (1. - dropout_prob)**(1/2))
93
+ p *= torch.bernoulli(torch.zeros_like(p) + 1. - dropout_prob)
94
+
95
+ def forward(self):
96
+ def sample_normal():
97
+ if self.pre_sample_causes:
98
+ causes = torch.normal(self.causes_mean, self.causes_std.abs()).float()
99
+ else:
100
+ causes = torch.normal(0., 1., (seq_len, 1, self.num_causes), device=device).float()
101
+ return causes
102
+
103
+ if self.sampling == 'normal':
104
+ causes = sample_normal()
105
+ elif self.sampling == 'mixed':
106
+ zipf_p, multi_p, normal_p = random.random() * 0.66, random.random() * 0.66, random.random() * 0.66
107
+ def sample_cause(n):
108
+ if random.random() > normal_p:
109
+ if self.pre_sample_causes:
110
+ return torch.normal(self.causes_mean[:, :, n], self.causes_std[:, :, n].abs()).float()
111
+ else:
112
+ return torch.normal(0., 1., (seq_len, 1), device=device).float()
113
+ elif random.random() > multi_p:
114
+ x = torch.multinomial(torch.rand((random.randint(2, 10))), seq_len, replacement=True).to(device).unsqueeze(-1).float()
115
+ x = (x - torch.mean(x)) / torch.std(x)
116
+ return x
117
+ else:
118
+ x = torch.minimum(torch.tensor(np.random.zipf(2.0 + random.random() * 2, size=(seq_len)),
119
+ device=device).unsqueeze(-1).float(), torch.tensor(10.0, device=device))
120
+ return x - torch.mean(x)
121
+ causes = torch.cat([sample_cause(n).unsqueeze(-1) for n in range(self.num_causes)], -1)
122
+ elif self.sampling == 'uniform':
123
+ causes = torch.rand((seq_len, 1, self.num_causes), device=device)
124
+ else:
125
+ raise ValueError(f'Sampling is set to invalid setting: {self.sampling}.')
126
+
127
+ outputs = [causes]
128
+ for layer in self.layers:
129
+ outputs.append(layer(outputs[-1]))
130
+ outputs = outputs[2:]
131
+
132
+ if self.is_causal:
133
+ ## Sample nodes from graph if model is causal
134
+ outputs_flat = torch.cat(outputs, -1)
135
+
136
+ if self.in_clique:
137
+ random_perm = random.randint(0, outputs_flat.shape[-1] - num_outputs - num_features) + torch.randperm(num_outputs + num_features, device=device)
138
+ else:
139
+ random_perm = torch.randperm(outputs_flat.shape[-1]-1, device=device)
140
+
141
+ random_idx_y = list(range(-num_outputs, -0)) if self.y_is_effect else random_perm[0:num_outputs]
142
+ random_idx = random_perm[num_outputs:num_outputs + num_features]
143
+
144
+ if self.sort_features:
145
+ random_idx, _ = torch.sort(random_idx)
146
+ y = outputs_flat[:, :, random_idx_y]
147
+
148
+ x = outputs_flat[:, :, random_idx]
149
+ else:
150
+ y = outputs[-1][:, :, :]
151
+ x = causes
152
+
153
+ if bool(torch.any(torch.isnan(x)).detach().cpu().numpy()) or bool(torch.any(torch.isnan(y)).detach().cpu().numpy()):
154
+ x[:] = 0.0
155
+ y[:] = 1.0
156
+
157
+ return x, y
158
+
159
+ model = MLP(hyperparameters).to(device)
160
+
161
+ sample = sum([[model()] for _ in range(0, batch_size)], [])
162
+
163
+ x, y = zip(*sample)
164
+ y = torch.cat(y, 1).detach().squeeze(2)
165
+ x = torch.cat(x, 1).detach()
166
+ x = x[..., torch.randperm(x.shape[-1])]
167
+
168
+ return x, y, y
169
+
170
+
171
+ DataLoader = get_batch_to_dataloader(get_batch)
172
+ DataLoader.num_outputs = 1
173
+
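A hedged sketch of calling the MLP prior above with a minimal, non-causal hyperparameter set (these values are illustrative guesses, not the tuned configuration shipped with the repo).

from torch import nn
from priors import mlp

hps = {
    'num_layers': 3, 'prior_mlp_hidden_dim': 64, 'prior_mlp_activations': lambda: nn.Tanh(),
    'is_causal': False, 'pre_sample_causes': False, 'pre_sample_weights': False,
    'noise_std': 0.1, 'block_wise_dropout': False, 'init_std': 1.0,
    'prior_mlp_dropout_prob': 0.1, 'sampling': 'normal',
}
x, y, y_ = mlp.get_batch(batch_size=4, seq_len=60, num_features=5, hyperparameters=hps)
# x: (60, 4, 5) inputs, y: (60, 4) targets sampled from a fresh random MLP per batch element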
TabPFN/priors/prior.py ADDED
@@ -0,0 +1,12 @@
1
+ from torch.utils.data import DataLoader
2
+
3
+
4
+ class PriorDataLoader(DataLoader):
5
+ pass
6
+ # init accepts num_steps as first argument
7
+
8
+ # has two attributes set on class or object level:
9
+ # num_features: int and
10
+ # num_outputs: int
11
+ # fuse_x_y: bool
12
+ # Optional: validate function that accepts a transformer model
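The contract listed in these comments is what get_batch_to_dataloader (defined in priors/utils.py below) produces from a get_batch function. A self-contained toy sketch, assuming only the (T, B, H) tensor convention used throughout this commit:

import torch
from priors.utils import get_batch_to_dataloader

def toy_get_batch(batch_size, seq_len, num_features, **kwargs):
    x = torch.rand(seq_len, batch_size, num_features)
    y = x.mean(-1)
    return x, y, y

ToyDataLoader = get_batch_to_dataloader(toy_get_batch)
ToyDataLoader.num_outputs = 1
dl = ToyDataLoader(num_steps=2, fuse_x_y=False, batch_size=8, seq_len=10, num_features=4)
(style, x, y), target_y = next(iter(dl))  # style is None for priors that return no style vector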
TabPFN/priors/prior_bag.py ADDED
@@ -0,0 +1,32 @@
1
+ import torch
2
+
3
+ from .utils import get_batch_to_dataloader
4
+ from utils import default_device
5
+
6
+ def get_batch(batch_size, seq_len, num_features, device=default_device
7
+ , hyperparameters=None, batch_size_per_gp_sample=None, **kwargs):
8
+ batch_size_per_gp_sample = batch_size_per_gp_sample or (min(64, batch_size))
9
+ num_models = batch_size // batch_size_per_gp_sample
10
+ assert num_models * batch_size_per_gp_sample == batch_size, f'Batch size ({batch_size}) not divisible by batch_size_per_gp_sample ({batch_size_per_gp_sample})'
11
+
12
+ args = {'device': device, 'seq_len': seq_len, 'num_features': num_features, 'batch_size': batch_size_per_gp_sample}
13
+
14
+ prior_bag_priors_get_batch = hyperparameters['prior_bag_get_batch']
15
+ prior_bag_priors_p = [1.0] + [hyperparameters[f'prior_bag_exp_weights_{i}'] for i in range(1, len(prior_bag_priors_get_batch))]
16
+
17
+ weights = torch.tensor(prior_bag_priors_p, dtype=torch.float) # create a tensor of weights
18
+ batch_assignments = torch.multinomial(torch.softmax(weights, 0), num_models, replacement=True).numpy()
19
+
20
+ if 'verbose' in hyperparameters and hyperparameters['verbose']:
21
+ print('PRIOR_BAG:', weights, batch_assignments)
22
+
23
+ sample = sum([[prior_bag_priors_get_batch[int(prior_idx)](hyperparameters=hyperparameters, **args)] for prior_idx in batch_assignments], [])
24
+
25
+ x, y, y_ = zip(*sample)
26
+ x, y, y_ = (torch.cat(x, 1).detach()
27
+ , torch.cat(y, 1).detach()
28
+ , torch.cat(y_, 1).detach())
29
+ return x, y, y_
30
+
31
+ DataLoader = get_batch_to_dataloader(get_batch)
32
+ DataLoader.num_outputs = 1
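A sketch of how the bag is parameterised (the weight and GP values are illustrative): the first prior gets a fixed logit of 1.0, each further prior i gets 'prior_bag_exp_weights_i', and the models in a batch are assigned by sampling from the softmax of these logits.

from priors import fast_gp, prior_bag

hyperparameters = {
    'prior_bag_get_batch': (fast_gp.get_batch, fast_gp.get_batch),  # two priors, here the same one twice
    'prior_bag_exp_weights_1': 2.0,   # logit of the second prior; the first is fixed at 1.0
    'noise': 1e-4, 'outputscale': 1.0, 'lengthscale': 0.5,          # consumed by fast_gp.get_batch
}
x, y, y_ = prior_bag.get_batch(batch_size=8, seq_len=100, num_features=5,
                               hyperparameters=hyperparameters)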
TabPFN/priors/utils.py ADDED
@@ -0,0 +1,163 @@
1
+ import random
2
+
3
+ import torch
4
+
5
+ from utils import set_locals_in_self
6
+ from .prior import PriorDataLoader
7
+ from torch import nn
8
+ import numpy as np
9
+ import matplotlib.pyplot as plt
10
+ import matplotlib.gridspec as gridspec
11
+ import scipy.stats as stats
12
+ import math
13
+
14
+ def get_batch_to_dataloader(get_batch_method_):
15
+ class DL(PriorDataLoader):
16
+ get_batch_method = get_batch_method_
17
+
18
+ # Caution, you might need to set self.num_features manually if it is not part of the args.
19
+ def __init__(self, num_steps, fuse_x_y=False, **get_batch_kwargs):
20
+ set_locals_in_self(locals())
21
+ # If not given in the kwargs, fall back to the class attribute set before instantiation.
22
+ self.num_features = get_batch_kwargs.get('num_features') or self.num_features
23
+ self.num_outputs = get_batch_kwargs.get('num_outputs') or self.num_outputs
24
+ print('DataLoader.__dict__', self.__dict__)
25
+
26
+ @staticmethod
27
+ def gbm(*args, fuse_x_y=True, **kwargs):
28
+ dynamic_seq_len = callable(kwargs['seq_len'])
29
+ kwargs['seq_len'] = kwargs['seq_len']() if dynamic_seq_len else kwargs['seq_len']
30
+ # Scales the batch size dynamically with the power of 'dynamic_batch_size'.
31
+ # A transformer with quadratic memory usage in the seq len would need a power of 2 to keep memory constant.
32
+ if dynamic_seq_len and 'dynamic_batch_size' in kwargs and kwargs['dynamic_batch_size'] > 0:
33
+ kwargs['batch_size'] = kwargs['batch_size'] * math.floor(math.pow(kwargs['seq_len_maximum'], kwargs['dynamic_batch_size']) / math.pow(kwargs['seq_len'], kwargs['dynamic_batch_size']))
34
+ batch = get_batch_method_(*args, **kwargs)
35
+ x, y, target_y, style = batch if len(batch) == 4 else (batch[0], batch[1], batch[2], None)
36
+ if fuse_x_y:
37
+ return torch.cat([x, torch.cat([torch.zeros_like(y[:1]), y[:-1]], 0).unsqueeze(-1).float()],
38
+ -1), target_y
39
+ else:
40
+ return (style, x, y), target_y
41
+
42
+ def __len__(self):
43
+ return self.num_steps
44
+
45
+ def __iter__(self):
46
+ return iter(self.gbm(**self.get_batch_kwargs, fuse_x_y=self.fuse_x_y) for _ in range(self.num_steps))
47
+
48
+
49
+ return DL
50
+
51
+ import seaborn as sns
52
+ def plot_features(data, targets, fig=None):
53
+ if torch.is_tensor(data):
54
+ data = data.detach().cpu().numpy()
55
+ targets = targets.detach().cpu().numpy()
56
+ #data = np.concatenate([data, data[:, -1:]], -1)
57
+ #df = pd.DataFrame(data, columns=list(range(0, data.shape[1])))
58
+ #g = sns.pairplot(df, hue=data.shape[1]-1, palette="Set2", diag_kind="kde", height=2.5)
59
+ #plt.legend([], [], frameon=False)
60
+ #g._legend.remove()
61
+ #g = sns.PairGrid(df, hue=data.shape[1]-1)
62
+ #g.map_diag(sns.histplot)
63
+ #g.map_offdiag(sns.scatterplot)
64
+ #g._legend.remove()
65
+
66
+ fig2 = fig if fig else plt.figure(figsize=(8, 8))
67
+ spec2 = gridspec.GridSpec(ncols=data.shape[1], nrows=data.shape[1], figure=fig2)
68
+ for d in range(0, data.shape[1]):
69
+ for d2 in range(0, data.shape[1]):
70
+ sub_ax = fig2.add_subplot(spec2[d, d2])
71
+ if d == d2:
72
+ sns.kdeplot(data[:, d],hue=targets[:],ax=sub_ax,legend=False, palette="deep")
73
+ sub_ax.set(ylabel=None)
74
+ else:
75
+ sns.scatterplot(x=data[:, d], y=data[:, d2],
76
+ hue=targets[:],legend=False, palette="deep")
77
+ #plt.scatter(data[:, d], data[:, d2],
78
+ # c=targets[:])
79
+ sub_ax.get_xaxis().set_ticks([])
80
+ sub_ax.get_yaxis().set_ticks([])
81
+ plt.subplots_adjust(wspace=0.05, hspace=0.05)
82
+ fig2.show()
83
+
84
+
85
+ def plot_prior(prior):
86
+ s = np.array([prior() for _ in range(0, 1000)])
87
+ count, bins, ignored = plt.hist(s, 50, density=True)
88
+ print(s.min())
89
+ plt.show()
90
+
91
+ trunc_norm_sampler_f = lambda mu, sigma : lambda: stats.truncnorm((0 - mu) / sigma, (1000000 - mu) / sigma, loc=mu, scale=sigma).rvs(1)[0]
92
+ beta_sampler_f = lambda a, b : lambda : np.random.beta(a, b)
93
+ gamma_sampler_f = lambda a, b : lambda : np.random.gamma(a, b)
94
+ uniform_sampler_f = lambda a, b : lambda : np.random.uniform(a, b)
95
+ uniform_int_sampler_f = lambda a, b : lambda : round(np.random.uniform(a, b))
96
+ def zipf_sampler_f(a, b, c):
97
+ x = np.arange(b, c)
98
+ weights = x ** (-a)
99
+ weights /= weights.sum()
100
+ return lambda : stats.rv_discrete(name='bounded_zipf', values=(x, weights)).rvs(1)
101
+ scaled_beta_sampler_f = lambda a, b, scale, minimum : lambda : minimum + round(beta_sampler_f(a, b)() * (scale - minimum))
102
+
103
+
104
+ def normalize_by_used_features_f(x, num_features_used, num_features, normalize_with_sqrt=False):
105
+ if normalize_with_sqrt:
106
+ return x / (num_features_used / num_features)**(1 / 2)
107
+ return x / (num_features_used / num_features)
108
+
109
+
110
+ def order_by_y(x, y):
111
+ order = torch.argsort(y if random.randint(0, 1) else -y, dim=0)[:, 0, 0]
112
+ order = order.reshape(2, -1).transpose(0, 1).reshape(-1)#.reshape(seq_len)
113
+ x = x[order] # .reshape(2, -1).transpose(0, 1).reshape(-1).flip([0]).reshape(seq_len, 1, -1)
114
+ y = y[order] # .reshape(2, -1).transpose(0, 1).reshape(-1).reshape(seq_len, 1, -1)
115
+
116
+ return x, y
117
+
118
+ def randomize_classes(x, num_classes):
119
+ classes = torch.arange(0, num_classes, device=x.device)
120
+ random_classes = torch.randperm(num_classes, device=x.device).type(x.type())
121
+ x = ((x.unsqueeze(-1) == classes) * random_classes).sum(-1)
122
+ return x
123
+
124
+
125
+ class CategoricalActivation(nn.Module):
126
+ def __init__(self, categorical_p=0.1, ordered_p=0.7
127
+ , keep_activation_size=False
128
+ , num_classes_sampler=zipf_sampler_f(0.8, 1, 10)):
129
+ self.categorical_p = categorical_p
130
+ self.ordered_p = ordered_p
131
+ self.keep_activation_size = keep_activation_size
132
+ self.num_classes_sampler = num_classes_sampler
133
+
134
+ super().__init__()
135
+
136
+ def forward(self, x):
137
+ # x shape: T, B, H
138
+
139
+ x = nn.Softsign()(x)
140
+
141
+ num_classes = self.num_classes_sampler()
142
+ hid_strength = torch.abs(x).mean(0).unsqueeze(0) if self.keep_activation_size else None
143
+
144
+ categorical_classes = torch.rand((x.shape[1], x.shape[2])) < self.categorical_p
145
+ class_boundaries = torch.zeros((num_classes - 1, x.shape[1], x.shape[2]), device=x.device, dtype=x.dtype)
146
+ # Sample a different index for each hidden dimension, but shared for all batches
147
+ for b in range(x.shape[1]):
148
+ for h in range(x.shape[2]):
149
+ ind = torch.randint(0, x.shape[0], (num_classes - 1,))
150
+ class_boundaries[:, b, h] = x[ind, b, h]
151
+
152
+ for b in range(x.shape[1]):
153
+ x_rel = x[:, b, categorical_classes[b]]
154
+ boundaries_rel = class_boundaries[:, b, categorical_classes[b]].unsqueeze(1)
155
+ x[:, b, categorical_classes[b]] = (x_rel > boundaries_rel).sum(dim=0).float() - num_classes / 2
156
+
157
+ ordered_classes = torch.rand((x.shape[1],x.shape[2])) < self.ordered_p
158
+ ordered_classes = torch.logical_and(ordered_classes, categorical_classes)
159
+ x[:, ordered_classes] = randomize_classes(x[:, ordered_classes], num_classes)
160
+
161
+ x = x * hid_strength if self.keep_activation_size else x
162
+
163
+ return x
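Two quick worked examples for the helpers above (values illustrative): the sampler factories return thunks that draw a fresh value per call, and normalize_by_used_features_f rescales features so that padding unused feature slots with zeros does not shrink the overall input scale.

import torch
from priors.utils import uniform_int_sampler_f, normalize_by_used_features_f

sample_num_classes = uniform_int_sampler_f(2, 10)
n_classes = sample_num_classes()            # fresh draw on every call

x = torch.ones(100, 8, 20)                  # (T, B, H) with 20 feature slots
x_scaled = normalize_by_used_features_f(x, num_features_used=5, num_features=20)
# every entry is multiplied by 20 / 5 = 4 (or the square root of that with normalize_with_sqrt=True)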
TabPFN/requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ # Please use Python 3.7 to be compatible with all packages
2
+ gpytorch==1.5.0
3
+ torch==1.9.0
4
+ scikit-learn==0.24.2
5
+ pyyaml==5.4.1
6
+ seaborn==0.11.2
7
+ xgboost==1.4.0
8
+ tqdm==4.62.1
9
+ numpy==1.21.2
10
+ openml==0.12.2
11
+ catboost==0.26.1
12
+ auto-sklearn==0.14.5
13
+ hyperopt==0.2.5
14
+ configspace==0.4.21
15
+ # autogluon==0.4.0
TabPFN/scripts/baseline_prediction_interface.py ADDED
@@ -0,0 +1,38 @@
1
+ import tqdm
2
+ import numpy as np
3
+
4
+ def baseline_predict(metric_function, eval_xs, eval_ys, categorical_feats, metric_used=None, eval_pos=2, max_time=300, **kwargs):
5
+ """
6
+ Baseline prediction interface.
7
+ :param metric_function:
8
+ :param eval_xs:
9
+ :param eval_ys:
10
+ :param categorical_feats:
11
+ :param metric_used:
12
+ :param eval_pos:
13
+ :param max_time: Scheduled maximum time
14
+ :param kwargs:
15
+ :return: list [np.array(metrics), np.array(outputs), best_configs] or [None, None, None] if failed
16
+ """
17
+
18
+ metrics = []
19
+ outputs = []
20
+ best_configs = []
21
+ eval_splits = list(zip(eval_xs.transpose(0, 1), eval_ys.transpose(0, 1)))
22
+ for eval_x, eval_y in tqdm.tqdm(eval_splits, desc='Calculating splits'+str(metric_function)+' '+str(eval_pos)):
23
+ try:
24
+ metric, output, best_config = metric_function(eval_x[:eval_pos],
25
+ eval_y[:eval_pos],
26
+ eval_x[eval_pos:],
27
+ eval_y[eval_pos:],
28
+ categorical_feats,
29
+ metric_used=metric_used
30
+ , max_time=max_time)
31
+ metrics += [metric]
32
+ outputs += [output]
33
+ best_configs += [best_config]
34
+ return np.array(metrics), np.array(outputs), best_configs
35
+ except Exception as e:
36
+ print(f'There was an exception in {metric_function}')
37
+ print(e)
38
+ return None, None, None
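A self-contained usage sketch (the majority_baseline stand-in is hypothetical; real baselines with this call signature live in scripts/tabular_baselines.py):

import numpy as np
import torch
from scripts.baseline_prediction_interface import baseline_predict

def majority_baseline(x_train, y_train, x_test, y_test, categorical_feats, metric_used=None, max_time=None):
    # Hypothetical baseline: predict the training-set majority class and report accuracy.
    majority = float(y_train.float().mean().round())
    preds = np.full(len(y_test), majority)
    return float((preds == y_test.numpy()).mean()), preds, {}

eval_xs = torch.rand(200, 3, 10)              # (seq_len, n_splits, n_features)
eval_ys = (torch.rand(200, 3) > 0.5).float()  # binary targets
metrics, outputs, best_configs = baseline_predict(
    majority_baseline, eval_xs, eval_ys, categorical_feats=[], eval_pos=100, max_time=60)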
TabPFN/scripts/differentiable_pfn_evaluation.py ADDED
@@ -0,0 +1,391 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ import time
5
+ import pickle
6
+ from scripts import tabular_metrics
7
+ from scripts.tabular_metrics import calculate_score_per_method
8
+ from scripts.tabular_evaluation import evaluate
9
+ from priors.differentiable_prior import draw_random_style
10
+ from tqdm import tqdm
11
+ from pathlib import Path
12
+ import random
13
+ from model_builder import load_model
14
+ from scripts.transformer_prediction_interface import get_params_from_config
15
+
16
+ """
17
+ ===============================
18
+ PUBLIC FUNCTIONS FOR EVALUATION
19
+ ===============================
20
+ """
21
+
22
+
23
+ def eval_model_range(i_range, *args, **kwargs):
24
+ for i in i_range:
25
+ eval_model(i, *args, **kwargs)
26
+
27
+
28
+ def load_model_workflow(i, e, add_name, base_path, device='cpu', eval_addition=''):
29
+ """
30
+ Workflow for loading a model and setting appropriate parameters for diffable hparam tuning.
31
+
32
+ :param i:
33
+ :param e:
34
+ :param eval_positions_valid:
35
+ :param add_name:
36
+ :param base_path:
37
+ :param device:
38
+ :param eval_addition:
39
+ :return:
40
+ """
41
+ def check_file(e):
42
+ model_file = f'models_diff/prior_diff_real_checkpoint{add_name}_n_{i}_epoch_{e}.cpkt'
43
+ model_path = os.path.join(base_path, model_file)
44
+ # print('Evaluate ', model_path)
45
+ results_file = os.path.join(base_path,
46
+ f'models_diff/prior_diff_real_results{add_name}_n_{i}_epoch_{e}_{eval_addition}.pkl')
47
+ if not Path(model_path).is_file(): # or Path(results_file).is_file():
48
+ return None, None, None
49
+ return model_file, model_path, results_file
50
+
51
+ model_file = None
52
+ if e == -1:
53
+ for e_ in range(100, -1, -1):
54
+ model_file_, model_path_, results_file_ = check_file(e_)
55
+ if model_file_ is not None:
56
+ e = e_
57
+ model_file, model_path, results_file = model_file_, model_path_, results_file_
58
+ break
59
+ else:
60
+ model_file, model_path, results_file = check_file(e)
61
+
62
+ if model_file is None:
63
+ print('No checkpoint found')
64
+ return None
65
+
66
+ print(f'Loading {model_file}')
67
+
68
+ model, c = load_model(base_path, model_file, device, eval_positions=[], verbose=False)
69
+
70
+ return model, c, results_file
71
+
72
+
73
+ def eval_model(i, e, valid_datasets, test_datasets, train_datasets, eval_positions_valid, eval_positions_test,
74
+ bptt_valid,
75
+ bptt_test, add_name, base_path, device='cpu', eval_addition='', **extra_tuning_args):
76
+ """
77
+ Differentiable model evaluation workflow. Evaluates and saves results to disk.
78
+
79
+ :param i:
80
+ :param e:
81
+ :param valid_datasets:
82
+ :param test_datasets:
83
+ :param train_datasets:
84
+ :param eval_positions_valid:
85
+ :param eval_positions_test:
86
+ :param bptt_valid:
87
+ :param bptt_test:
88
+ :param add_name:
89
+ :param base_path:
90
+ :param device:
91
+ :param eval_addition:
92
+ :param extra_tuning_args:
93
+ :return:
94
+ """
95
+ model, c, results_file = load_model_workflow(i, e, add_name, base_path, device, eval_addition)
96
+ params = {'bptt': bptt_valid
97
+ , 'bptt_final': bptt_test
98
+ , 'eval_positions': eval_positions_valid
99
+ , 'eval_positions_test': eval_positions_test
100
+ , 'valid_datasets': valid_datasets
101
+ , 'test_datasets': test_datasets
102
+ , 'train_datasets': train_datasets
103
+ , 'verbose': True
104
+ , 'device': device
105
+ }
106
+
107
+ params.update(get_params_from_config(c))
108
+
109
+ start = time.time()
110
+ metrics, metrics_valid, style, temperature, optimization_route = evaluate_differentiable_model(model, **params,
111
+ **extra_tuning_args)
112
+ print('Evaluation time: ', time.time() - start)
113
+
114
+ print(results_file)
115
+ r = [c.copy(), metrics, metrics_valid, style.to('cpu'), temperature.to('cpu'), optimization_route]
116
+ with open(results_file, 'wb') as output:
117
+ del r[0]['num_features_used']
118
+ del r[0]['categorical_features_sampler']
119
+ pickle.dump(r, output)
120
+
121
+ _, _, _, style, temperature, _ = r
122
+
123
+ return r, model
124
+
125
+ """
126
+ ===============================
127
+ INTERNAL HELPER FUNCTIONS
128
+ ===============================
129
+ """
130
+
131
+ def evaluate_differentiable_model(model
132
+ , valid_datasets
133
+ , test_datasets
134
+ , train_datasets
135
+ , N_draws=100
136
+ , N_grad_steps=10
137
+ , eval_positions=None
138
+ , eval_positions_test=None
139
+ , bptt=100
140
+ , bptt_final=200
141
+ , style=None
142
+ , n_parallel_configurations=1
143
+ , device='cpu'
144
+ , selection_metric='auc'
145
+ , final_splits=[1, 2, 3, 4, 5]
146
+ , N_ensemble_configurations_list=[1, 5, 10, 20, 50, 100]
147
+ , **kwargs):
148
+ """
149
+ Evaluation function for diffable model evaluation. Returns a list of results.
150
+
151
+ :param model:
152
+ :param valid_datasets:
153
+ :param test_datasets:
154
+ :param train_datasets:
155
+ :param N_draws:
156
+ :param N_grad_steps:
157
+ :param eval_positions:
158
+ :param eval_positions_test:
159
+ :param bptt:
160
+ :param bptt_final:
161
+ :param style:
162
+ :param n_parallel_configurations:
163
+ :param device:
164
+ :param selection_metric:
165
+ :param final_splits:
166
+ :param N_ensemble_configurations_list:
167
+ :param kwargs:
168
+ :return:
169
+ """
170
+ torch.manual_seed(0)
171
+ np.random.seed(0)
172
+ random.seed(0)
173
+
174
+ diffable_metric = tabular_metrics.cross_entropy
175
+ evaluation_metric = tabular_metrics.auc_metric
176
+ if selection_metric in ('auc', 'roc'):
177
+ selection_metric_min_max = 'max'
178
+ selection_metric = tabular_metrics.auc_metric
179
+ evaluation_metric = selection_metric
180
+ elif selection_metric in ('ce', 'selection_metric'):
181
+ selection_metric_min_max = 'min'
182
+ selection_metric = tabular_metrics.cross_entropy
183
+ evaluation_metric = selection_metric
184
+
185
+ print('Diffable metric', diffable_metric, ' Selection metric', selection_metric, ' Evaluation metric',
186
+ evaluation_metric)
187
+ print('N PARALLEL CONFIGURATIONS', n_parallel_configurations)
188
+ print('eval_positions', eval_positions)
189
+
190
+ def evaluate_valid(style, softmax_temperature, results, results_tracked):
191
+ result_valid = eval_step(valid_datasets, style, softmax_temperature=softmax_temperature,
192
+ return_tensor=False, inference_mode=True, selection_metric=selection_metric,
193
+ evaluation_metric=evaluation_metric, eval_positions=eval_positions, bptt=bptt, model=model[2])
194
+ result_valid = [float(result_valid[f'mean_select_at_{pos}']) for pos in eval_positions]
195
+ results += [result_valid]
196
+ results_tracked += [np.nanmean(result_valid)]
197
+
198
+ model[2].to(device)
199
+ model[2].eval()
200
+
201
+ results_on_valid, results_on_valid_tracked = [], []
202
+ best_style, best_softmax_temperature = style, torch.cat(
203
+ [torch.tensor([0.0]).to(device) for n in range(0, n_parallel_configurations)], 0)
204
+ optimization_routes = []
205
+
206
+ best_style = torch.cat([draw_random_style(model[3], device).detach() for n in range(0, n_parallel_configurations)],
207
+ 0)
208
+ best_softmax_temperature = torch.cat([torch.tensor([0.0]).to(device) for n in range(0, n_parallel_configurations)],
209
+ 0)
210
+
211
+
212
+ for _ in tqdm(range(0, N_draws), desc='Iterate over Optimization initializations'): # Evaluates N hparam draws
213
+ style = torch.cat([draw_random_style(model[3], device).detach() for n in range(0, n_parallel_configurations)],
214
+ 0)
215
+ softmax_temperature = torch.cat([torch.tensor([0.0]).to(device) for n in range(0, n_parallel_configurations)],
216
+ 0)
217
+
218
+ evaluate_valid(style, softmax_temperature, results_on_valid, results_on_valid_tracked)
219
+
220
+ print(f'Draw --> Valid Selection metric: {results_on_valid[-1]}')
221
+
222
+ if N_grad_steps > 0:
223
+ gradient_optimize_result = gradient_optimize_style(model, style, N_grad_steps
224
+ , softmax_temperature=softmax_temperature
225
+ , model=model[2]
226
+ , train_datasets=train_datasets
227
+ , valid_datasets=valid_datasets
228
+ , selection_metric_min_max=selection_metric_min_max
229
+ , **kwargs)
230
+ optimization_routes += [gradient_optimize_result['optimization_route']]
231
+
232
+ evaluate_valid(gradient_optimize_result['best_style']
233
+ , gradient_optimize_result['best_temperature']
234
+ , results_on_valid, results_on_valid_tracked)
235
+
236
+ print(f'After diff --> Valid Selection metric: {results_on_valid[-1]}')
237
+
238
+ if selection_metric_min_max == 'min':
239
+ is_best = (results_on_valid_tracked[-1] <= min(results_on_valid_tracked))
240
+ else:
241
+ is_best = (results_on_valid_tracked[-1] >= max(results_on_valid_tracked))
242
+
243
+ if is_best or best_style is None:
244
+ best_style = gradient_optimize_result['best_style'].clone()
245
+ best_softmax_temperature = gradient_optimize_result['best_temperature'].clone()
246
+ torch.cuda.empty_cache()
247
+
248
+ def final_evaluation():
249
+ print('Running eval dataset with final params (no gradients)..')
250
+ print(best_style, best_softmax_temperature)
251
+ result_test = []
252
+ for N_ensemble_configurations in N_ensemble_configurations_list:
253
+ print(f'Running with {N_ensemble_configurations} ensemble_configurations')
254
+ kwargs['N_ensemble_configurations'] = N_ensemble_configurations
255
+ splits = []
256
+ for split in final_splits:
257
+ splits += [eval_step(test_datasets, best_style, softmax_temperature=best_softmax_temperature
258
+ , return_tensor=False, eval_positions=eval_positions_test,
259
+ bptt=bptt_final, inference_mode=True, split_number=split, model=model[2]
260
+ , selection_metric=selection_metric, evaluation_metric=evaluation_metric)]
261
+ result_test += [splits]
262
+
263
+ print('Running valid dataset with final params (no gradients)..')
264
+ result_valid = eval_step(valid_datasets, best_style, softmax_temperature=best_softmax_temperature
265
+ , return_tensor=False, eval_positions=eval_positions_test,
266
+ bptt=bptt_final, inference_mode=True, model=model[2]
267
+ , selection_metric=selection_metric, evaluation_metric=evaluation_metric)
268
+
269
+ return result_test, result_valid
270
+
271
+ result_test, result_valid = final_evaluation()
272
+
273
+ return result_test, result_valid, best_style, best_softmax_temperature, optimization_routes
274
+
275
+
276
+ def eval_step(ds, used_style, selection_metric, evaluation_metric, eval_positions, return_tensor=True, **kwargs):
277
+ def step():
278
+ return evaluate(datasets=ds,
279
+ method='transformer'
280
+ , overwrite=True
281
+ , style=used_style
282
+ , eval_positions=eval_positions
283
+ , metric_used=selection_metric
284
+ , save=False
285
+ , path_interfix=None
286
+ , base_path=None
287
+ , verbose=True
288
+ , **kwargs)
289
+
290
+ if return_tensor:
291
+ r = step()
292
+ else:
293
+ with torch.no_grad():
294
+ r = step()
295
+
296
+ calculate_score_per_method(selection_metric, 'select', r, ds, eval_positions, aggregator='mean')
297
+ calculate_score_per_method(evaluation_metric, 'eval', r, ds, eval_positions, aggregator='mean')
298
+
299
+ return r
300
+
301
+
302
+ def gradient_optimize_style(model, init_style, steps, softmax_temperature, train_datasets, valid_datasets, learning_rate=0.03, optimize_all=False,
303
+ limit_style=True, N_datasets_sampled=90, optimize_softmax_temperature=True, selection_metric_min_max='max', **kwargs):
304
+ """
305
+ Uses gradient-based methods to optimize 'style' on the 'train_datasets' and selects the best step on the 'valid_datasets'.
306
+
307
+ :param model:
308
+ :param init_style:
309
+ :param steps:
310
+ :param learning_rate:
311
+ :param softmax_temperature:
312
+ :param train_datasets:
313
+ :param valid_datasets:
314
+ :param optimize_all:
315
+ :param limit_style:
316
+ :param N_datasets_sampled:
317
+ :param optimize_softmax_temperature:
318
+ :param selection_metric_min_max:
319
+ :param kwargs:
320
+ :return:
321
+ """
322
+ grad_style = torch.nn.Parameter(init_style.detach(), requires_grad=True)
323
+
324
+ best_style, best_temperature, best_selection_metric, best_diffable_metric = grad_style.detach(), softmax_temperature.detach(), None, None
325
+ softmax_temperature = torch.nn.Parameter(softmax_temperature.detach(), requires_grad=optimize_softmax_temperature)
326
+ variables_to_optimize = model[2].parameters() if optimize_all else [grad_style, softmax_temperature]
327
+ optimizer = torch.optim.Adam(variables_to_optimize, lr=learning_rate)
328
+
329
+ optimization_route_selection, optimization_route_diffable = [], []
330
+ optimization_route_selection_valid, optimization_route_diffable_valid = [], []
331
+
332
+ def eval_opt(ds, return_tensor=True, inference_mode=False):
333
+ result = eval_step(ds, grad_style, softmax_temperature=softmax_temperature, return_tensor=return_tensor
334
+ , inference_mode=inference_mode, model=model[2], **kwargs)
335
+
336
+ diffable_metric = result['mean_metric']
337
+ selection_metric = result['mean_select']
338
+
339
+ return diffable_metric, selection_metric
340
+
341
+ def eval_all_datasets(datasets, propagate=True):
342
+ selection_metrics_this_step, diffable_metrics_this_step = [], []
343
+ for ds in datasets:
344
+ diffable_metric_train, selection_metric_train = eval_opt([ds], inference_mode=(not propagate))
345
+ if not torch.isnan(diffable_metric_train).any():
346
+ if propagate and diffable_metric_train.requires_grad == True:
347
+ diffable_metric_train.backward()
348
+ selection_metrics_this_step += [selection_metric_train]
349
+ diffable_metrics_this_step += [float(diffable_metric_train.detach().cpu().numpy())]
350
+ diffable_metric_train = np.nanmean(diffable_metrics_this_step)
351
+ selection_metric_train = np.nanmean(selection_metrics_this_step)
352
+
353
+ return diffable_metric_train, selection_metric_train
354
+
355
+ for t in tqdm(range(steps), desc='Iterate over Optimization steps'):
356
+ optimizer.zero_grad()
357
+
358
+ # Select subset of datasets
359
+ random.seed(t)
360
+ train_datasets_ = random.sample(train_datasets, N_datasets_sampled)
361
+
362
+ # Get score on train
363
+ diffable_metric_train, selection_metric_train = eval_all_datasets(train_datasets_, propagate=True)
364
+ optimization_route_selection += [float(selection_metric_train)]
365
+ optimization_route_diffable += [float(diffable_metric_train)]
366
+
367
+ # Get score on valid
368
+ diffable_metric_valid, selection_metric_valid = eval_all_datasets(valid_datasets, propagate=False)
369
+ optimization_route_selection_valid += [float(selection_metric_valid)]
370
+ optimization_route_diffable_valid += [float(diffable_metric_valid)]
371
+
372
+ is_best = (best_selection_metric is not None) and (selection_metric_min_max == 'min' and best_selection_metric > selection_metric_valid)  # guard the first iteration, where best_selection_metric is still None
373
+ is_best = is_best or ((best_selection_metric is not None) and selection_metric_min_max == 'max' and best_selection_metric < selection_metric_valid)
374
+ if (best_selection_metric is None) or (not np.isnan(selection_metric_valid) and is_best):
375
+ print('New best', best_selection_metric, selection_metric_valid)
376
+ best_style = grad_style.detach().clone()
377
+ best_temperature = softmax_temperature.detach().clone()
378
+ best_selection_metric, best_diffable_metric = selection_metric_valid, diffable_metric_valid
379
+
380
+ optimizer.step()
381
+
382
+ if limit_style:
383
+ grad_style = grad_style.detach().clamp(-1.74, 1.74)
384
+
385
+ print(f'Valid: Diffable metric={diffable_metric_valid} Selection metric={selection_metric_valid}; ' +
386
+ f'Train: Diffable metric={diffable_metric_train} Selection metric={selection_metric_train}')
387
+
388
+ print(f'Return best:{best_style} {best_selection_metric}')
389
+ return {'best_style': best_style, 'best_temperature': best_temperature
390
+ , 'optimization_route': {'select': optimization_route_selection, 'loss': optimization_route_diffable,
391
+ 'test_select': optimization_route_selection_valid, 'test_loss': optimization_route_diffable_valid}}
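A minimal usage sketch for the optimizer above (not part of the commit); the style width, dataset variables and step counts are hypothetical, and any extra evaluation arguments (e.g. eval_positions, bptt) would be forwarded to eval_step through **kwargs.

import torch

# Hypothetical inputs: `model` is the tuple returned by load_model (model[2] is the transformer),
# `train_datasets` / `valid_datasets` are dataset lists in the repo's usual format, and 512 is an
# assumed style dimensionality, not necessarily that of the shipped checkpoint.
init_style = torch.zeros(1, 512)
softmax_temperature = torch.log(torch.tensor(0.8))

result = gradient_optimize_style(model, init_style, steps=50,
                                 softmax_temperature=softmax_temperature,
                                 train_datasets=train_datasets,
                                 valid_datasets=valid_datasets,
                                 learning_rate=0.03,
                                 N_datasets_sampled=10,
                                 selection_metric_min_max='max')
best_style, best_temperature = result['best_style'], result['best_temperature']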
TabPFN/scripts/model_configs.py ADDED
@@ -0,0 +1,210 @@
1
+ from copy import deepcopy
2
+ from priors.utils import uniform_int_sampler_f
3
+ from priors.differentiable_prior import DifferentiableHyperparameter
4
+ from ConfigSpace import hyperparameters as CSH
5
+ import torch
6
+ from priors.differentiable_prior import replace_differentiable_distributions
7
+
8
+ import ConfigSpace as CS
9
+
10
+ def get_general_config(max_features, bptt, eval_positions=None):
11
+ """
12
+ Returns the general PFN training hyperparameters.
13
+ """
14
+ config_general = {
15
+ "lr": CSH.UniformFloatHyperparameter('lr', lower=0.00002, upper=0.0002, log=True),
16
+ "dropout": CSH.CategoricalHyperparameter('dropout', [0.0]),
17
+ "emsize": CSH.CategoricalHyperparameter('emsize', [2 ** i for i in range(8, 9)]), ## upper bound is -1
18
+ "batch_size": CSH.CategoricalHyperparameter('batch_size', [2 ** i for i in range(8, 9)]),
19
+ "nlayers": CSH.CategoricalHyperparameter('nlayers', [12]),
20
+ "num_features": max_features,
21
+ "nhead": CSH.CategoricalHyperparameter('nhead', [4]),
22
+ "nhid_factor": 2,
23
+ "bptt": bptt,
24
+ "eval_positions": None,
25
+ "seq_len_used": bptt,
26
+ "sampling": 'normal',#hp.choice('sampling', ['mixed', 'normal']), # uniform
27
+ "epochs": 80,
28
+ "num_steps": 100,
29
+ "verbose": False,
30
+ "pre_sample_causes": True, # This is MLP
31
+ "mix_activations": False,#hp.choice('mix_activations', [True, False]),
32
+ }
33
+
34
+ return config_general
35
+
36
+ def get_flexible_categorical_config(max_features):
37
+ """
38
+ Returns the configuration parameters for the tabular multiclass wrapper.
39
+ """
40
+ config_flexible_categorical = {
41
+ "nan_prob_unknown_reason_reason_prior": CSH.CategoricalHyperparameter('nan_prob_unknown_reason_reason_prior', [1.0]),
42
+ "categorical_feature_p": CSH.CategoricalHyperparameter('categorical_feature_p', [0.0]),
43
+ "nan_prob_no_reason": CSH.CategoricalHyperparameter('nan_prob_no_reason', [0.0, 0.1, 0.2]),
44
+ "nan_prob_unknown_reason": CSH.CategoricalHyperparameter('nan_prob_unknown_reason', [0.0]),
45
+ "nan_prob_a_reason": CSH.CategoricalHyperparameter('nan_prob_a_reason', [0.0]),
46
+ # "num_classes": lambda : random.randint(2, 10), "balanced": False,
47
+ "max_num_classes": 2,
48
+ "num_classes": 2,
49
+ "noise_type": CSH.CategoricalHyperparameter('noise_type', ["Gaussian"]), # NN
50
+ "balanced": True,
51
+ "normalize_to_ranking": CSH.CategoricalHyperparameter('normalize_to_ranking', [False]),
52
+ "set_value_to_nan": CSH.CategoricalHyperparameter('set_value_to_nan', [0.5, 0.2, 0.0]),
53
+ "normalize_by_used_features": True,
54
+ "num_features_used":
55
+ {'uniform_int_sampler_f(3,max_features)': uniform_int_sampler_f(1, max_features)}
56
+ # hp.choice('conv_activation', [{'distribution': 'uniform', 'min': 2.0, 'max': 8.0}, None]),
57
+ }
58
+ return config_flexible_categorical
59
+
60
+ def get_diff_flex():
61
+ """
62
+ Returns the configuration parameters for a differentiable wrapper around the tabular multiclass wrapper.
63
+ """
64
+ diff_flex = {
65
+ # "ordinal_pct": {'distribution': 'uniform', 'min': 0.0, 'max': 0.5},
66
+ # "num_categorical_features_sampler_a": hp.choice('num_categorical_features_sampler_a',
67
+ # [{'distribution': 'uniform', 'min': 0.3, 'max': 0.9}, None]),
68
+ # "num_categorical_features_sampler_b": {'distribution': 'uniform', 'min': 0.3, 'max': 0.9},
69
+ "output_multiclass_ordered_p": {'distribution': 'uniform', 'min': 0.0, 'max': 0.5}, #CSH.CategoricalHyperparameter('output_multiclass_ordered_p', [0.0, 0.1, 0.2]),
70
+ "multiclass_type": {'distribution': 'meta_choice', 'choice_values': ['value', 'rank']},
71
+ }
72
+
73
+ return diff_flex
74
+
75
+ def get_diff_gp():
76
+ """
77
+ Returns the configuration parameters for a differentiable wrapper around GP.
78
+ """
79
+ diff_gp = {
80
+ 'outputscale': {'distribution': 'meta_trunc_norm_log_scaled', 'max_mean': 10., 'min_mean': 0.00001, 'round': False,
81
+ 'lower_bound': 0},
82
+ 'lengthscale': {'distribution': 'meta_trunc_norm_log_scaled', 'max_mean': 10., 'min_mean': 0.00001, 'round': False,
83
+ 'lower_bound': 0},
84
+ 'noise': {'distribution': 'meta_choice', 'choice_values': [0.00001, 0.0001, 0.01]}
85
+ }
86
+
87
+ return diff_gp
88
+
89
+ def get_diff_causal():
90
+ """
91
+ Returns the configuration parameters for a differentiable wrapper around MLP / Causal mixture.
92
+ """
93
+ diff_causal = {
94
+ "num_layers": {'distribution': 'meta_trunc_norm_log_scaled', 'max_mean': 6, 'min_mean': 1, 'round': True,
95
+ 'lower_bound': 2},
96
+ # Better beta?
97
+ "prior_mlp_hidden_dim": {'distribution': 'meta_trunc_norm_log_scaled', 'max_mean': 130, 'min_mean': 5,
98
+ 'round': True, 'lower_bound': 4},
99
+
100
+ "prior_mlp_dropout_prob": {'distribution': 'meta_beta', 'scale': 0.9, 'min': 0.1, 'max': 5.0},
101
+ # This mustn't be too high since activations get too large otherwise
102
+
103
+ "noise_std": {'distribution': 'meta_trunc_norm_log_scaled', 'max_mean': .3, 'min_mean': 0.0001, 'round': False,
104
+ 'lower_bound': 0.0},
105
+ "init_std": {'distribution': 'meta_trunc_norm_log_scaled', 'max_mean': 10.0, 'min_mean': 0.01, 'round': False,
106
+ 'lower_bound': 0.0},
107
+ "num_causes": {'distribution': 'meta_trunc_norm_log_scaled', 'max_mean': 12, 'min_mean': 1, 'round': True,
108
+ 'lower_bound': 1},
109
+ "is_causal": {'distribution': 'meta_choice', 'choice_values': [True, False]},
110
+ "pre_sample_weights": {'distribution': 'meta_choice', 'choice_values': [True, False]},
111
+ "y_is_effect": {'distribution': 'meta_choice', 'choice_values': [True, False]},
112
+ "prior_mlp_activations": {'distribution': 'meta_choice_mixed', 'choice_values': [
113
+ torch.nn.Tanh
114
+ , torch.nn.ReLU
115
+ , torch.nn.Identity
116
+ , lambda : torch.nn.LeakyReLU(negative_slope=0.1)
117
+ , torch.nn.ELU
118
+ ]},
119
+ "block_wise_dropout": {'distribution': 'meta_choice', 'choice_values': [True, False]},
120
+ "sort_features": {'distribution': 'meta_choice', 'choice_values': [True, False]},
121
+ "in_clique": {'distribution': 'meta_choice', 'choice_values': [True, False]},
122
+ }
123
+
124
+ return diff_causal
125
+
126
+ def get_diff_prior_bag():
127
+ """
128
+ Returns the configuration parameters for a GP and MLP / Causal mixture.
129
+ """
130
+ diff_prior_bag = {
131
+ 'prior_bag_exp_weights_1': {'distribution': 'uniform', 'min': 100000., 'max': 100001.},
132
+ # MLP Weight (Biased, since MLP works better, 1.0 is weight for prior number 0)
133
+ }
134
+
135
+ return diff_prior_bag
136
+
137
+ def get_diff_config():
138
+ """
139
+ Returns the configuration parameters for a differentiable wrapper around GP and MLP / Causal mixture priors.
140
+ """
141
+ diff_prior_bag = get_diff_prior_bag()
142
+ diff_causal = get_diff_causal()
143
+ diff_gp = get_diff_gp()
144
+ diff_flex = get_diff_flex()
145
+
146
+ config_diff = {'differentiable_hyperparameters': {**diff_prior_bag, **diff_causal, **diff_gp, **diff_flex}}
147
+
148
+ return config_diff
149
+
150
+
151
+ def sample_differentiable(config):
152
+ """
153
+ Returns sampled hyperparameters from a differentiable wrapper, i.e. it turns the differentiable
154
+ hyperparameter specifications into concrete (non-differentiable) values.
155
+ """
156
+ # config is a dict of dicts, dicts that have a 'distribution' key are treated as distributions to be sampled
157
+ result = deepcopy(config)
158
+ del result['differentiable_hyperparameters']
159
+
160
+ for k, v in config['differentiable_hyperparameters'].items():
161
+ s_indicator, s_hp = DifferentiableHyperparameter(**v, embedding_dim=None,
162
+ device=None)() # both of these are actually not used to the best of my knowledge
163
+ result[k] = s_hp
164
+
165
+ return result
166
+
167
+ def list_all_hps_in_nested(config):
168
+ """
169
+ Returns a list of hyperparameters from a nested dict of hyperparameters.
170
+ """
171
+
172
+ if isinstance(config, CSH.Hyperparameter):
173
+ return [config]
174
+ elif isinstance(config, dict):
175
+ result = []
176
+ for k, v in config.items():
177
+ result += list_all_hps_in_nested(v)
178
+ return result
179
+ else:
180
+ return []
181
+
182
+ def create_configspace_from_hierarchical(config):
183
+ cs = CS.ConfigurationSpace()
184
+ for hp in list_all_hps_in_nested(config):
185
+ cs.add_hyperparameter(hp)
186
+ return cs
187
+
188
+ def fill_in_configsample(config, configsample):
189
+ # config is our dict that defines config distribution
190
+ # configsample is a CS.Configuration
191
+ hierarchical_configsample = deepcopy(config)
192
+ for k, v in config.items():
193
+ if isinstance(v, CSH.Hyperparameter):
194
+ hierarchical_configsample[k] = configsample[v.name]
195
+ elif isinstance(v, dict):
196
+ hierarchical_configsample[k] = fill_in_configsample(v, configsample)
197
+ return hierarchical_configsample
198
+
199
+
200
+ def evaluate_hypers(config, sample_diff_hps=False):
201
+ """
202
+ Samples a hyperparameter configuration from a sampleable configuration (can be used in HP search).
203
+ """
204
+ if sample_diff_hps:
205
+ # I do a deepcopy here, such that the config stays the same and can still be used with diff. hps
206
+ config = deepcopy(config)
207
+ replace_differentiable_distributions(config)
208
+ cs = create_configspace_from_hierarchical(config)
209
+ cs_sample = cs.sample_configuration()
210
+ return fill_in_configsample(config, cs_sample)
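A hedged sketch (not part of the commit) of how the helpers in this file can be composed and sampled; the exact composition used by the training notebooks may differ.

# Build a sampleable config from the helpers above and draw one concrete sample.
config = {**get_general_config(max_features=100, bptt=1024),
          **get_flexible_categorical_config(max_features=100),
          **get_diff_config()}

config_sample = evaluate_hypers(config)                             # resolves all CSH hyperparameters
config_sample_diff = evaluate_hypers(config, sample_diff_hps=True)  # additionally samples the differentiable ones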
TabPFN/scripts/tabular_baselines.py ADDED
@@ -0,0 +1,421 @@
1
+ from catboost import CatBoostClassifier, Pool
2
+
3
+ import math
4
+
5
+ from sklearn.impute import SimpleImputer
6
+
7
+ import xgboost as xgb
8
+ from sklearn import neighbors
9
+ from sklearn.gaussian_process import GaussianProcessClassifier
10
+ from sklearn.gaussian_process.kernels import RBF
11
+ import numpy as np
12
+
13
+ from scripts import tabular_metrics
14
+ import pandas as pd
15
+
16
+ from sklearn.linear_model import LogisticRegression
17
+ from sklearn.model_selection import cross_val_score
18
+ import time
19
+
20
+ from hyperopt import fmin, tpe, hp, STATUS_OK, Trials , space_eval, rand
21
+ from sklearn.compose import ColumnTransformer
22
+ from sklearn.preprocessing import OneHotEncoder
23
+ from sklearn.preprocessing import MinMaxScaler
24
+
25
+ import autosklearn.classification
26
+
27
+ CV = 5
28
+ MULTITHREAD = 1 # Number of threads baselines are able to use at most
29
+ param_grid, param_grid_hyperopt = {}, {}
30
+
31
+ def get_scoring_direction(metric_used):
32
+ # Not needed
33
+ if metric_used == tabular_metrics.auc_metric:
34
+ return -1
35
+ elif metric_used == tabular_metrics.cross_entropy:
36
+ return 1
37
+ else:
38
+ raise Exception('No scoring string found for metric')
39
+
40
+ def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
41
+ if metric_used == tabular_metrics.auc_metric:
42
+ if usage == 'sklearn_cv':
43
+ return 'roc_auc_ovo'
44
+ elif usage == 'autogluon':
45
+ return 'log_loss' # AutoGluon crashes when using 'roc_auc' with some datasets; using logloss gives better scores.
46
+ # We might be able to fix this, but it doesn't work out of the box.
47
+ # File bug report? Error happens with dataset robert and fabert
48
+ if multiclass:
49
+ return 'roc_auc_ovo_macro'
50
+ else:
51
+ return 'roc_auc'
52
+ elif usage == 'autosklearn':
53
+ if multiclass:
54
+ return autosklearn.metrics.log_loss # roc_auc only works for binary, use logloss instead
55
+ else:
56
+ return autosklearn.metrics.roc_auc
57
+ elif usage == 'catboost':
58
+ return 'MultiClass' # Effectively LogLoss, ROC not available
59
+ elif usage == 'xgb':
60
+ return 'logloss'
61
+ return 'roc_auc'
62
+ elif metric_used == tabular_metrics.cross_entropy:
63
+ if usage == 'sklearn_cv':
64
+ return 'neg_log_loss'
65
+ elif usage == 'autogluon':
66
+ return 'log_loss'
67
+ elif usage == 'autosklearn':
68
+ return autosklearn.metrics.log_loss
69
+ elif usage == 'catboost':
70
+ return 'MultiClass' # Effectively LogLoss
71
+ return 'logloss'
72
+ else:
73
+ raise Exception('No scoring string found for metric')
74
+
75
+ def eval_f(params, clf_, x, y, metric_used, start_time, max_time):
76
+ if time.time() - start_time > max_time:
77
+ return np.nan
78
+ scores = cross_val_score(clf_(**params), x, y, cv=CV, scoring=get_scoring_string(metric_used))
79
+
80
+ return -np.nanmean(scores)
81
+
82
+ def preprocess_impute(x, y, test_x, test_y, impute, one_hot, standardize, cat_features=[]):
83
+ import warnings
84
+ def warn(*args, **kwargs):
85
+ pass
86
+
87
+ warnings.warn = warn
88
+
89
+ x, y, test_x, test_y = x.cpu().numpy(), y.cpu().long().numpy(), test_x.cpu().numpy(), test_y.cpu().long().numpy()
90
+
91
+ if impute:
92
+ imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
93
+ imp_mean.fit(x)
94
+ x, test_x = imp_mean.transform(x), imp_mean.transform(test_x)
95
+
96
+ if one_hot:
97
+ def make_pd_from_np(x):
98
+ data = pd.DataFrame(x)
99
+ for c in cat_features:
100
+ data.iloc[:, c] = data.iloc[:, c].astype('int')
101
+ return data
102
+ x, test_x = make_pd_from_np(x), make_pd_from_np(test_x)
103
+ transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_features)], remainder="passthrough")
104
+ transformer.fit(x)
105
+ x, test_x = transformer.transform(x), transformer.transform(test_x)
106
+
107
+ if standardize:
108
+ scaler = MinMaxScaler()
109
+ scaler.fit(x)
110
+ x, test_x = scaler.transform(x), scaler.transform(test_x)
111
+
112
+ return x, y, test_x, test_y
113
+
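A small, hedged sketch of the preprocessing helper above (not part of the commit); the tensors are synthetic and only illustrate the expected input and output types.

import torch

# preprocess_impute expects torch tensors (as produced by the dataset loaders) and returns numpy arrays
# ready for the sklearn-style baselines below.
x = torch.randn(100, 5); y = (torch.randn(100) > 0).long()
test_x = torch.randn(50, 5); test_y = (torch.randn(50) > 0).long()

x_np, y_np, test_x_np, test_y_np = preprocess_impute(
    x, y, test_x, test_y, impute=True, one_hot=False, standardize=True, cat_features=[])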
114
+ ## Auto Gluon
115
+ def autogluon_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
116
+ from autogluon.tabular import TabularPredictor # Imported inside the function so the rest of the package can be used without autogluon installed
117
+ x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y
118
+ , one_hot=False
119
+ , cat_features=cat_features
120
+ , impute=False
121
+ , standardize=False)
122
+ train_data = pd.DataFrame(np.concatenate([x, y[:, np.newaxis]], 1))
123
+ test_data = pd.DataFrame(np.concatenate([test_x, test_y[:, np.newaxis]], 1))
124
+
125
+ # AutoGluon automatically infers datatypes, we don't specify the categorical labels
126
+ predictor = TabularPredictor(
127
+ label=train_data.columns[-1],
128
+ eval_metric=get_scoring_string(metric_used, usage='autogluon', multiclass=(len(np.unique(y)) > 2)),
129
+ problem_type='multiclass' if len(np.unique(y)) > 2 else 'binary'
130
+ ## seed=int(y[:].sum()) doesn't accept seed
131
+ ).fit(
132
+ train_data=train_data,
133
+ time_limit=max_time,
134
+ presets=['best_quality']
135
+ # The seed is deterministic but varies for each dataset and each split of it
136
+ )
137
+
138
+ pred = predictor.predict_proba(test_data, as_multiclass=True).values
139
+
140
+ metric = metric_used(test_y, pred)
141
+
142
+ return metric, pred, predictor.fit_summary()
143
+
144
+ ## AUTO Sklearn
145
+ def autosklearn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
146
+ return autosklearn2_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=max_time, version=1)
147
+
148
+ from autosklearn.experimental.askl2 import AutoSklearn2Classifier
149
+ from autosklearn.classification import AutoSklearnClassifier
150
+ def autosklearn2_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300, version=2):
151
+ x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y
152
+ , one_hot=False
153
+ , cat_features=cat_features
154
+ , impute=False
155
+ , standardize=False)
156
+
157
+ def make_pd_from_np(x):
158
+ data = pd.DataFrame(x)
159
+ for c in cat_features:
160
+ data.iloc[:, c] = data.iloc[:, c].astype('category')
161
+ return data
162
+
163
+ x = make_pd_from_np(x)
164
+ test_x = make_pd_from_np(test_x)
165
+
166
+ clf_ = AutoSklearn2Classifier if version == 2 else AutoSklearnClassifier
167
+ clf = clf_(time_left_for_this_task=max_time,
168
+ memory_limit=4000,
169
+ n_jobs=MULTITHREAD,
170
+ seed=int(y[:].sum()),
171
+ # The seed is deterministic but varies for each dataset and each split of it
172
+ metric=get_scoring_string(metric_used, usage='autosklearn', multiclass=len(np.unique(y)) > 2))
173
+
174
+ # fit model to data
175
+ clf.fit(x, y)
176
+
177
+ pred = clf.predict_proba(test_x)
178
+ metric = metric_used(test_y, pred)
179
+
180
+ return metric, pred, None
181
+
182
+ param_grid_hyperopt['logistic'] = {
183
+ 'penalty': hp.choice('penalty', ['l1', 'l2', 'none'])
184
+ , 'max_iter': hp.randint('max_iter', 50, 500)
185
+ , 'fit_intercept': hp.choice('fit_intercept', [True, False])
186
+ , 'C': hp.loguniform('C', -5, math.log(5.0))} # 'normalize': [False],
187
+
188
+ def logistic_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
189
+ x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y
190
+ , one_hot=True, impute=True, standardize=True
191
+ , cat_features=cat_features)
192
+
193
+ def clf_(**params):
194
+ return LogisticRegression(solver='saga', tol=1e-4, n_jobs=1, **params)
195
+
196
+ start_time = time.time()
197
+
198
+ def stop(trial):
199
+ return time.time() - start_time > max_time, []
200
+
201
+ best = fmin(
202
+ fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
203
+ space=param_grid_hyperopt['logistic'],
204
+ algo=rand.suggest,
205
+ rstate=np.random.RandomState(int(y[:].sum())),
206
+ early_stop_fn=stop,
207
+ # The seed is deterministic but varies for each dataset and each split of it
208
+ max_evals=10000)
209
+ best = space_eval(param_grid_hyperopt['logistic'], best)
210
+
211
+ clf = clf_(**best)
212
+ clf.fit(x, y)
213
+
214
+ pred = clf.predict_proba(test_x)
215
+ metric = metric_used(test_y, pred)
216
+
217
+ return metric, pred, best
218
+
219
+ ## KNN
220
+ param_grid_hyperopt['knn'] = {'n_neighbors': hp.randint('n_neighbors', 1,16)
221
+ }
222
+ def knn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
223
+ x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
224
+ one_hot=True, impute=True, standardize=True,
225
+ cat_features=cat_features)
226
+
227
+ def clf_(**params):
228
+ return neighbors.KNeighborsClassifier(n_jobs=1, **params)
229
+
230
+ start_time = time.time()
231
+
232
+ def stop(trial):
233
+ return time.time() - start_time > max_time, []
234
+
235
+ best = fmin(
236
+ fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
237
+ space=param_grid_hyperopt['knn'],
238
+ algo=rand.suggest,
239
+ rstate=np.random.RandomState(int(y[:].sum())),
240
+ early_stop_fn=stop,
241
+ # The seed is deterministic but varies for each dataset and each split of it
242
+ max_evals=10000)
243
+ best = space_eval(param_grid_hyperopt['knn'], best)
244
+
245
+ clf = clf_(**best)
246
+ clf.fit(x, y)
247
+
248
+ pred = clf.predict_proba(test_x)
249
+ metric = metric_used(test_y, pred)
250
+
251
+ return metric, pred, best
252
+
253
+ ## GP
254
+ param_grid_hyperopt['gp'] = {
255
+ 'params_y_scale': hp.loguniform('params_y_scale', math.log(0.05), math.log(5.0)),
256
+ 'params_length_scale': hp.loguniform('params_length_scale', math.log(0.1), math.log(1.0)),
257
+ 'n_jobs': hp.choice('njobs', [1])
258
+ }
259
+ def gp_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
260
+ x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
261
+ one_hot=True, impute=True, standardize=True,
262
+ cat_features=cat_features)
263
+
264
+ def clf_(params_y_scale,params_length_scale, **params):
265
+ return GaussianProcessClassifier(kernel= params_y_scale * RBF(params_length_scale), **params)
266
+
267
+ start_time = time.time()
268
+ def stop(trial):
269
+ return time.time() - start_time > max_time, []
270
+
271
+
272
+ best = fmin(
273
+ fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
274
+ space=param_grid_hyperopt['gp'],
275
+ algo=rand.suggest,
276
+ rstate=np.random.RandomState(int(y[:].sum())),
277
+ early_stop_fn=stop,
278
+ # The seed is deterministic but varies for each dataset and each split of it
279
+ max_evals=1000)
280
+ best = space_eval(param_grid_hyperopt['gp'], best)
281
+
282
+ clf = clf_(**best)
283
+ clf.fit(x, y)
284
+
285
+ pred = clf.predict_proba(test_x)
286
+ metric = metric_used(test_y, pred)
287
+
288
+ return metric, pred, best
289
+
290
+
291
+ # Catboost
292
+ # Hyperparameter space: https://arxiv.org/pdf/2106.03253.pdf
293
+
294
+ param_grid_hyperopt['catboost'] = {
295
+ 'learning_rate': hp.loguniform('learning_rate', math.log(math.pow(math.e, -5)), math.log(1)),
296
+ 'random_strength': hp.randint('random_strength', 1, 20),
297
+ 'l2_leaf_reg': hp.loguniform('l2_leaf_reg', math.log(1), math.log(10)),
298
+ 'bagging_temperature': hp.uniform('bagging_temperature', 0., 1),
299
+ 'leaf_estimation_iterations': hp.randint('leaf_estimation_iterations', 1, 20),
300
+ 'iterations': hp.randint('iterations', 100, 4000), # This is smaller than in paper, 4000 leads to ram overusage
301
+ }
302
+
303
+ def catboost_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
304
+ print(x)
305
+
306
+ x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y
307
+ , one_hot=False
308
+ , cat_features=cat_features
309
+ , impute=False
310
+ , standardize=False)
311
+
312
+ # Nans in categorical features must be encoded as separate class
313
+ x[:, cat_features], test_x[:, cat_features] = np.nan_to_num(x[:, cat_features], nan=-1), np.nan_to_num(
314
+ test_x[:, cat_features], nan=-1)
315
+
316
+ def make_pd_from_np(x):
317
+ data = pd.DataFrame(x)
318
+ for c in cat_features:
319
+ data.iloc[:, c] = data.iloc[:, c].astype('int')
320
+ return data
321
+
322
+ x = make_pd_from_np(x)
323
+ test_x = make_pd_from_np(test_x)
324
+
325
+ def clf_(**params):
326
+ return CatBoostClassifier(
327
+ loss_function=get_scoring_string(metric_used, usage='catboost'),
328
+ thread_count = MULTITHREAD,
329
+ used_ram_limit='4gb',
330
+ random_seed=int(y[:].sum()),
331
+ logging_level='Silent',
332
+ cat_features=cat_features,
333
+ **params)
334
+
335
+ start_time = time.time()
336
+ def stop(trial):
337
+ return time.time() - start_time > max_time, []
338
+
339
+ best = fmin(
340
+ fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
341
+ space=param_grid_hyperopt['catboost'],
342
+ algo=rand.suggest,
343
+ rstate=np.random.RandomState(int(y[:].sum())),
344
+ early_stop_fn=stop,
345
+ # The seed is deterministic but varies for each dataset and each split of it
346
+ max_evals=1000)
347
+ best = space_eval(param_grid_hyperopt['catboost'], best)
348
+
349
+ clf = clf_(**best)
350
+ clf.fit(x, y)
351
+
352
+ pred = clf.predict_proba(test_x)
353
+ metric = metric_used(test_y, pred)
354
+
355
+ return metric, pred, best
356
+
357
+
358
+ # XGBoost
359
+ # Hyperparameter space: https://arxiv.org/pdf/2106.03253.pdf
360
+ param_grid_hyperopt['xgb'] = {
361
+ 'learning_rate': hp.loguniform('learning_rate', -7, math.log(1)),
362
+ 'max_depth': hp.randint('max_depth', 1, 10),
363
+ 'subsample': hp.uniform('subsample', 0.2, 1),
364
+ 'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1),
365
+ 'colsample_bylevel': hp.uniform('colsample_bylevel', 0.2, 1),
366
+ 'min_child_weight': hp.loguniform('min_child_weight', -16, 5),
367
+ 'alpha': hp.loguniform('alpha', -16, 2),
368
+ 'lambda': hp.loguniform('lambda', -16, 2),
369
+ 'gamma': hp.loguniform('gamma', -16, 2),
370
+ 'n_estimators': hp.randint('n_estimators', 100, 4000), # This is smaller than in paper
371
+ }
372
+
373
+ def xgb_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
374
+ # XGB Documentation:
375
+ # XGB handles categorical data appropriately without One-Hot Encoding; categorical feature support is experimental
376
+ # XGB handles missing values appropriately without imputation
377
+
378
+ x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y
379
+ , one_hot=False
380
+ , cat_features=cat_features
381
+ , impute=False
382
+ , standardize=False)
383
+
384
+ def clf_(**params):
385
+ return xgb.XGBClassifier(use_label_encoder=False
386
+ , nthread=1
387
+ , **params
388
+ , eval_metric=get_scoring_string(metric_used, usage='xgb') # AUC not implemented
389
+ )
390
+
391
+ start_time = time.time()
392
+ def stop(trial):
393
+ return time.time() - start_time > max_time, []
394
+
395
+ best = fmin(
396
+ fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
397
+ space=param_grid_hyperopt['xgb'],
398
+ algo=rand.suggest,
399
+ rstate=np.random.RandomState(int(y[:].sum())),
400
+ early_stop_fn=stop,
401
+ # The seed is deterministic but varies for each dataset and each split of it
402
+ max_evals=1000)
403
+ best = space_eval(param_grid_hyperopt['xgb'], best)
404
+
405
+ clf = clf_(**best)
406
+ clf.fit(x, y)
407
+
408
+ pred = clf.predict_proba(test_x)
409
+ metric = metric_used(test_y, pred)
410
+
411
+ return metric, pred, best
412
+
413
+
414
+ clf_dict = {'gp': gp_metric
415
+ , 'knn': knn_metric
416
+ , 'catboost': catboost_metric
417
+ , 'xgb': xgb_metric
418
+ , 'logistic': logistic_metric
419
+ , 'autosklearn': autosklearn_metric
420
+ , 'autosklearn2': autosklearn2_metric
421
+ , 'autogluon': autogluon_metric}
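Since every baseline shares the same call signature, clf_dict can act as a simple dispatcher; a hedged sketch with synthetic tensors follows (not part of the commit).

import torch
from scripts import tabular_metrics

x = torch.randn(200, 5); y = (x[:, 0] > 0).long()
test_x = torch.randn(50, 5); test_y = (test_x[:, 0] > 0).long()

# Returns the metric value, the test predictions and the best hyperparameters found within the time budget.
metric, pred, best_hparams = clf_dict['knn'](x, y, test_x, test_y,
                                             cat_features=[],
                                             metric_used=tabular_metrics.auc_metric,
                                             max_time=30)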
TabPFN/scripts/tabular_evaluation.py ADDED
@@ -0,0 +1,284 @@
1
+ import time
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from tqdm import tqdm
6
+ import random
7
+ import numpy as np
8
+
9
+ from torch import nn
10
+
11
+ from utils import torch_nanmean
12
+ from datasets import *
13
+ from model_builder import load_model
14
+ from scripts.tabular_baselines import get_scoring_string
15
+ from scripts import tabular_metrics
16
+ from scripts.transformer_prediction_interface import *
17
+ from scripts.baseline_prediction_interface import *
18
+ """
19
+ ===============================
20
+ PUBLIC FUNCTIONS FOR EVALUATION
21
+ ===============================
22
+ """
23
+
24
+
25
+ def eval_model(i, e, valid_datasets, test_datasets, eval_positions, bptt, add_name, base_path, device='cpu', eval_addition='', **kwargs):
26
+ metrics_test, config_sample, model_path = eval_model_on_ds(i, e, test_datasets, eval_positions, bptt, add_name, base_path, device=device, eval_addition=eval_addition, **kwargs)
27
+ metrics_valid, _, _ = eval_model_on_ds(i, e, valid_datasets, eval_positions, bptt, add_name, base_path, device=device, eval_addition=eval_addition, **kwargs)
28
+ return {'mean_auc_test': metrics_test['mean_roc_at_1000'], 'mean_auc_valid': metrics_valid['mean_roc_at_1000'], 'mean_ce_test': metrics_test['mean_ce_at_1000'], 'mean_ce_valid': metrics_valid['mean_ce_at_1000'], 'config_sample': config_sample, 'model_path': model_path}
29
+
30
+ def eval_model_on_ds(i, e, valid_datasets, eval_positions, bptt, add_name, base_path, device='cpu', eval_addition='', **kwargs):
31
+
32
+ # How to use: eval_model_on_ds(i, 0, valid_datasets, [1024], 100000, add_name=model_string, base_path=base_path)
33
+ def check_file(e):
34
+ model_file = f'models_diff/prior_diff_real_checkpoint{add_name}_n_{i}_epoch_{e}.cpkt'
35
+ model_path = os.path.join(base_path, model_file)
36
+ # print('Evaluate ', model_path)
37
+ results_file = os.path.join(base_path,
38
+ f'models_diff/prior_diff_real_results{add_name}_n_{i}_epoch_{e}_{eval_addition}.pkl')
39
+ if not Path(model_path).is_file(): # or Path(results_file).is_file():
40
+ # print('checkpoint exists: ', Path(model_file).is_file(), ', results are written:', Path(results_file).is_file())
41
+ return None, None, None
42
+ return model_file, model_path, results_file
43
+
44
+ if e == -1: # use last checkpoint, if e == -1
45
+ for e_ in range(100, -1, -1):
46
+ model_file_, model_path_, results_file_ = check_file(e_)
47
+ if model_file_ is not None:
48
+ e = e_
49
+ model_file, model_path, results_file = model_file_, model_path_, results_file_
50
+ break
51
+ else:
52
+ model_file, model_path, results_file = check_file(e)
53
+
54
+ model, config_sample = load_model(base_path, model_file, device, None, verbose=False)
55
+ print(model[2].style_encoder)
56
+
57
+ params = {'max_features': config_sample['num_features']
58
+ , 'rescale_features': config_sample["normalize_by_used_features"]
59
+ , 'normalize_to_ranking': config_sample["normalize_to_ranking"]
60
+ , 'normalize_with_sqrt': config_sample.get("normalize_with_sqrt", False)
61
+ }
62
+ metrics_valid = evaluate(datasets=valid_datasets, model=model[2], method='transformer', device=device, overwrite=True,
63
+ extend_features=True
64
+ # just removed the style keyword but transformer is trained with style, just empty
65
+ , save=False
66
+ , metric_used=tabular_metrics.cross_entropy
67
+ , return_tensor=True
68
+ , verbose=False
69
+ , eval_positions=eval_positions
70
+ , bptt=bptt
71
+ , base_path=None
72
+ , inference_mode=True
73
+ , **params
74
+ , **kwargs)
75
+
76
+ tabular_metrics.calculate_score_per_method(tabular_metrics.auc_metric, 'roc', metrics_valid, valid_datasets, eval_positions)
77
+ tabular_metrics.calculate_score_per_method(tabular_metrics.cross_entropy, 'ce', metrics_valid, valid_datasets, eval_positions)
78
+
79
+ return metrics_valid, config_sample, model_path
80
+
81
+
82
+ def evaluate(datasets, bptt, eval_positions, metric_used, model
83
+ , verbose=False
84
+ , return_tensor=False
85
+ , **kwargs):
86
+ """
87
+ Evaluates a list of datasets for a model function.
88
+
89
+ :param datasets: List of datasets
90
+ :param bptt: maximum sequence length
91
+ :param eval_positions: List of positions where to evaluate models
92
+ :param verbose: If True, is verbose.
93
+ :param metric_used: Which metric is optimized for.
94
+ :param return_tensor: Whether to return results as a torch.Tensor or numpy array; only relevant for the transformer.
95
+ :param kwargs:
96
+ :return:
97
+ """
98
+ overall_result = {'metric_used': get_scoring_string(metric_used)
99
+ , 'bptt': bptt
100
+ , 'eval_positions': eval_positions}
101
+
102
+ aggregated_metric_datasets, num_datasets = torch.tensor(0.0), 0
103
+
104
+ # For each dataset
105
+ for [ds_name, X, y, categorical_feats, _, _] in tqdm(datasets, desc='Iterate over datasets') if verbose else datasets:
106
+ dataset_bptt = min(len(X), bptt)
107
+ # if verbose and dataset_bptt < bptt:
108
+ # print(f'Dataset too small for given sequence length, reducing to {len(X)} ({bptt})')
109
+
110
+ aggregated_metric, num = torch.tensor(0.0), 0
111
+ ds_result = {}
112
+
113
+ for eval_position in eval_positions:
114
+ eval_position_real = int(dataset_bptt * 0.5) if 2 * eval_position > dataset_bptt else eval_position
115
+ eval_position_bptt = int(eval_position_real * 2.0)
116
+
117
+ r = evaluate_position(X, y, model=model
118
+ , num_classes=len(torch.unique(y))
119
+ , categorical_feats = categorical_feats
120
+ , bptt = eval_position_bptt
121
+ , ds_name=ds_name
122
+ , eval_position = eval_position_real
123
+ , metric_used = metric_used
124
+ ,**kwargs)
125
+
126
+ if r is None:
127
+ continue
128
+
129
+ _, outputs, ys, best_configs, time_used = r
130
+
131
+ if torch.is_tensor(outputs):
132
+ outputs = outputs.to(outputs.device)
133
+ ys = ys.to(outputs.device)
134
+
135
+ ys = ys.T
136
+ ds_result[f'{ds_name}_best_configs_at_{eval_position}'] = best_configs
137
+ ds_result[f'{ds_name}_outputs_at_{eval_position}'] = outputs
138
+ ds_result[f'{ds_name}_ys_at_{eval_position}'] = ys
139
+ ds_result[f'{ds_name}_time_at_{eval_position}'] = time_used
140
+
141
+ new_metric = torch_nanmean(torch.stack([metric_used(ys[i], outputs[i]) for i in range(ys.shape[0])]))
142
+
143
+ if not return_tensor:
144
+ make_scalar = lambda x: float(x.detach().cpu().numpy()) if (torch.is_tensor(x) and (len(x.shape) == 0)) else x
145
+ new_metric = make_scalar(new_metric)
146
+ ds_result = {k: make_scalar(ds_result[k]) for k in ds_result.keys()}
147
+
148
+ lib = torch if return_tensor else np
149
+ if not lib.isnan(new_metric).any():
150
+ aggregated_metric, num = aggregated_metric + new_metric, num + 1
151
+
152
+ overall_result.update(ds_result)
153
+ if num > 0:
154
+ aggregated_metric_datasets, num_datasets = (aggregated_metric_datasets + (aggregated_metric / num)), num_datasets + 1
155
+
156
+ overall_result['mean_metric'] = aggregated_metric_datasets / num_datasets
157
+
158
+ return overall_result
159
+
160
+ """
161
+ ===============================
162
+ INTERNAL HELPER FUNCTIONS
163
+ ===============================
164
+ """
165
+
166
+ def check_file_exists(path):
167
+ """Checks if a pickle file exists. Returns None if not, else returns the unpickled file."""
168
+ if (os.path.isfile(path)):
169
+ print(f'loading results from {path}')
170
+ with open(path, 'rb') as f:
171
+ return np.load(f, allow_pickle=True).tolist()
172
+ return None
173
+
174
+ def generate_valid_split(X, y, bptt, eval_position, split_number=1):
175
+ """Generates a deterministic train-(test/valid) split. Both splits must contain the same classes and all classes in
176
+ the entire dataset. If no such split can be sampled within the allowed number of attempts, returns None.
177
+
178
+ :param X: torch tensor, feature values
179
+ :param y: torch tensor, class values
180
+ :param bptt: Number of samples in train + test
181
+ :param eval_position: Number of samples in train, i.e. from which index values are in test
182
+ :param split_number: The split id
183
+ :return:
184
+ """
185
+ done, seed = False, 13
186
+
187
+ torch.manual_seed(split_number)
188
+ perm = torch.randperm(X.shape[0]) if split_number > 1 else torch.arange(0, X.shape[0])
189
+ X, y = X[perm], y[perm]
190
+
191
+ while not done:
192
+ if seed > 20:
193
+ return None, None # No split could be generated in 7 passes, return None
194
+ random.seed(seed)
195
+ i = random.randint(0, len(X) - bptt) if len(X) - bptt > 0 else 0
196
+ y_ = y[i:i + bptt]
197
+
198
+ # Checks if all classes from dataset are contained and classes in train and test are equal (contain same
199
+ # classes)
200
+ done = len(torch.unique(y_)) == len(torch.unique(y))
201
+ done = done and torch.all(torch.unique(y_) == torch.unique(y))
202
+ done = done and len(torch.unique(y_[:eval_position])) == len(torch.unique(y_[eval_position:]))
203
+ done = done and torch.all(torch.unique(y_[:eval_position]) == torch.unique(y_[eval_position:]))
204
+ seed = seed + 1
205
+
206
+ eval_xs = torch.stack([X[i:i + bptt].clone()], 1)
207
+ eval_ys = torch.stack([y[i:i + bptt].clone()], 1)
208
+
209
+ return eval_xs, eval_ys
210
+
211
+
212
+ def evaluate_position(X, y, categorical_feats, model, bptt
213
+ , eval_position, overwrite, save, base_path, path_interfix, method, ds_name, fetch_only=False
214
+ , max_time=300, split_number=1
215
+ , per_step_normalization=False, **kwargs):
216
+ """
217
+ Evaluates a dataset with a 'bptt' number of training samples.
218
+
219
+ :param X: Dataset X
220
+ :param y: Dataset labels
221
+ :param categorical_feats: Indices of categorical features.
222
+ :param model: Model function
223
+ :param bptt: Sequence length.
224
+ :param eval_position: Number of training samples.
225
+ :param overwrite: If True, results on disk are overwritten.
227
+ :param save:
228
+ :param path_interfix: Used for constructing path to write on disk.
229
+ :param method: Model name.
230
+ :param ds_name: Dataset name.
231
+ :param fetch_only: Whether to only fetch saved results instead of computing them.
232
+ :param per_step_normalization:
233
+ :param kwargs:
234
+ :return:
235
+ """
236
+
237
+ if save:
238
+ path = os.path.join(base_path, f'results/tabular/{path_interfix}/results_{method}_{ds_name}_{eval_position}_{bptt}_{split_number}.npy')
239
+ #log_path =
240
+
241
+ ## Load results if on disk
242
+ if not overwrite:
243
+ result = check_file_exists(path)
244
+ if result is not None:
245
+ if not fetch_only:
246
+ print(f'Loaded saved result for {path}')
247
+ return result
248
+ elif fetch_only:
249
+ print(f'Could not load saved result for {path}')
250
+ return None
251
+
252
+ ## Generate data splits
253
+ eval_xs, eval_ys = generate_valid_split(X, y, bptt, eval_position, split_number=split_number)
254
+ if eval_xs is None:
255
+ print(f"No dataset could be generated {ds_name} {bptt}")
256
+ return None
257
+
258
+ eval_ys = (eval_ys > torch.unique(eval_ys).unsqueeze(0)).sum(axis=1).unsqueeze(-1)
259
+
260
+ start_time = time.time()
261
+
262
+ if isinstance(model, nn.Module): # Two separate predict interfaces for transformer and baselines
263
+ outputs, best_configs = transformer_predict(model, eval_xs, eval_ys, eval_position, categorical_feats=categorical_feats, **kwargs), None
264
+ else:
265
+ _, outputs, best_configs = baseline_predict(model, eval_xs, eval_ys, categorical_feats
266
+ , eval_pos=eval_position
267
+ , max_time=max_time, **kwargs)
268
+
269
+ eval_ys = eval_ys[eval_position:]
270
+ if outputs is None:
271
+ return None
272
+
273
+ if torch.is_tensor(outputs): # Transfers data to cpu for saving
274
+ outputs = outputs.cpu()
275
+ eval_ys = eval_ys.cpu()
276
+
277
+ ds_result = None, outputs, eval_ys, best_configs, time.time() - start_time
278
+
279
+ if save:
280
+ with open(path, 'wb') as f:
281
+ np.save(f, ds_result)
282
+ print(f'saved results to {path}')
283
+
284
+ return ds_result
TabPFN/scripts/tabular_metrics.py ADDED
@@ -0,0 +1,181 @@
1
+ """
2
+ ===============================
3
+ Metrics calculation
4
+ ===============================
5
+ Includes a few metrics as well as functions for composing metrics on results files.
6
+
7
+ """
8
+
9
+
10
+
11
+ import numpy as np
12
+ import torch
13
+ from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score, average_precision_score
14
+ from scipy.stats import rankdata
15
+ import pandas as pd
16
+
17
+ """
18
+ ===============================
19
+ Metrics calculation
20
+ ===============================
21
+ """
22
+ def auc_metric(target, pred, multi_class='ovo', numpy=False):
23
+ lib = np if numpy else torch
24
+ try:
25
+ if not numpy:
26
+ target = torch.tensor(target) if not torch.is_tensor(target) else target
27
+ pred = torch.tensor(pred) if not torch.is_tensor(pred) else pred
28
+ if len(lib.unique(target)) > 2:
29
+ if not numpy:
30
+ return torch.tensor(roc_auc_score(target, pred, multi_class=multi_class))
31
+ return roc_auc_score(target, pred, multi_class=multi_class)
32
+ else:
33
+ if len(pred.shape) == 2:
34
+ pred = pred[:, 1]
35
+ if not numpy:
36
+ return torch.tensor(roc_auc_score(target, pred))
37
+ return roc_auc_score(target, pred)
38
+ except ValueError as e:
39
+ print(e)
40
+ return np.nan
41
+
42
+ def accuracy_metric(target, pred):
43
+ target = torch.tensor(target) if not torch.is_tensor(target) else target
44
+ pred = torch.tensor(pred) if not torch.is_tensor(pred) else pred
45
+ if len(torch.unique(target)) > 2:
46
+ return torch.tensor(accuracy_score(target, torch.argmax(pred, -1)))
47
+ else:
48
+ return torch.tensor(accuracy_score(target, pred[:, 1] > 0.5))
49
+
50
+ def average_precision_metric(target, pred):
51
+ target = torch.tensor(target) if not torch.is_tensor(target) else target
52
+ pred = torch.tensor(pred) if not torch.is_tensor(pred) else pred
53
+ if len(torch.unique(target)) > 2:
54
+ return torch.tensor(average_precision_score(target, torch.argmax(pred, -1)))
55
+ else:
56
+ return torch.tensor(average_precision_score(target, pred[:, 1] > 0.5))
57
+
58
+ def balanced_accuracy_metric(target, pred):
59
+ target = torch.tensor(target) if not torch.is_tensor(target) else target
60
+ pred = torch.tensor(pred) if not torch.is_tensor(pred) else pred
61
+ if len(torch.unique(target)) > 2:
62
+ return torch.tensor(balanced_accuracy_score(target, torch.argmax(pred, -1)))
63
+ else:
64
+ return torch.tensor(balanced_accuracy_score(target, pred[:, 1] > 0.5))
65
+
66
+ def cross_entropy(target, pred):
67
+ target = torch.tensor(target) if not torch.is_tensor(target) else target
68
+ pred = torch.tensor(pred) if not torch.is_tensor(pred) else pred
69
+ if len(torch.unique(target)) > 2:
70
+ ce = torch.nn.CrossEntropyLoss()
71
+ return ce(pred.float(), target.long())
72
+ else:
73
+ bce = torch.nn.BCELoss()
74
+ return bce(pred[:, 1].float(), target.float())
75
+
76
+ def time_metric():
77
+ """
78
+ Dummy function, will just be used as a handler.
79
+ """
80
+ pass
81
+
82
+ def count_metric(x, y):
83
+ """
84
+ Dummy function, returns one count per dataset.
85
+ """
86
+ return 1
87
+
88
+ """
89
+ ===============================
90
+ Metrics composition
91
+ ===============================
92
+ """
93
+ def calculate_score_per_method(metric, name:str, global_results:dict, ds:list, eval_positions:list, aggregator:str='mean'):
94
+ """
95
+ Calculates the metric given by 'metric' and saves it under 'name' in the 'global_results'
96
+
97
+ :param metric: Metric function
98
+ :param name: Name of metric in 'global_results'
99
+ :param global_results: Dictionary containing the results of the current method for a collection of datasets
100
+ :param ds: Dataset to calculate metrics on, a list of dataset properties
101
+ :param eval_positions: List of positions to calculate metrics on
102
+ :param aggregator: Specifies way to aggregate results across evaluation positions
103
+ :return:
104
+ """
105
+ aggregator_f = np.nanmean if aggregator == 'mean' else np.nansum
106
+ for pos in eval_positions:
107
+ valid_positions = 0
108
+ for d in ds:
109
+ if f'{d[0]}_outputs_at_{pos}' in global_results:
110
+ preds = global_results[f'{d[0]}_outputs_at_{pos}']
111
+ y = global_results[f'{d[0]}_ys_at_{pos}']
112
+
113
+ preds, y = preds.detach().cpu().numpy() if torch.is_tensor(
114
+ preds) else preds, y.detach().cpu().numpy() if torch.is_tensor(y) else y
115
+
116
+ try:
117
+ if metric == time_metric:
118
+ global_results[f'{d[0]}_{name}_at_{pos}'] = global_results[f'{d[0]}_time_at_{pos}']
119
+ valid_positions = valid_positions + 1
120
+ else:
121
+ global_results[f'{d[0]}_{name}_at_{pos}'] = aggregator_f(
122
+ [metric(y[split], preds[split]) for split in range(y.shape[0])])
123
+ valid_positions = valid_positions + 1
124
+ except Exception as err:
125
+ print(f'Error calculating metric with {err}, {type(err)} at {d[0]} {pos} {name}')
126
+ global_results[f'{d[0]}_{name}_at_{pos}'] = np.nan
127
+ else:
128
+ global_results[f'{d[0]}_{name}_at_{pos}'] = np.nan
129
+
130
+ if valid_positions > 0:
131
+ global_results[f'{aggregator}_{name}_at_{pos}'] = aggregator_f([global_results[f'{d[0]}_{name}_at_{pos}'] for d in ds])
132
+ else:
133
+ global_results[f'{aggregator}_{name}_at_{pos}'] = np.nan
134
+
135
+ for d in ds:
136
+ metrics = [global_results[f'{d[0]}_{name}_at_{pos}'] for pos in eval_positions]
137
+ metrics = [m for m in metrics if not np.isnan(m)]
138
+ global_results[f'{d[0]}_{aggregator}_{name}'] = aggregator_f(metrics) if len(metrics) > 0 else np.nan
139
+
140
+ metrics = [global_results[f'{aggregator}_{name}_at_{pos}'] for pos in eval_positions]
141
+ metrics = [m for m in metrics if not np.isnan(m)]
142
+ global_results[f'{aggregator}_{name}'] = aggregator_f(metrics) if len(metrics) > 0 else np.nan
143
+
144
+
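A hedged sketch of how calculate_score_per_method consumes a results dictionary (not part of the commit); the dataset name, outputs and labels are synthetic, and the key naming follows the '<ds>_outputs_at_<pos>' convention used by evaluate().

results = {'toy_ds_outputs_at_500': np.random.rand(1, 50, 3),      # (splits, samples, classes)
           'toy_ds_ys_at_500': np.random.randint(0, 3, size=(1, 50))}
calculate_score_per_method(accuracy_metric, 'acc', results, ds=[('toy_ds',)], eval_positions=[500])
print(results['toy_ds_acc_at_500'], results['mean_acc'])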
145
+ def calculate_score(metric, name, global_results, ds, eval_positions, aggregator='mean', limit_to=''):
146
+ """
147
+ Calls calculate_metrics_by_method with a range of methods. See arguments of that method.
148
+ :param limit_to: This method will not get metric calculations.
149
+ """
150
+ for m in global_results:
151
+ if limit_to not in m:
152
+ continue
153
+ calculate_score_per_method(metric, name, global_results[m], ds, eval_positions, aggregator=aggregator)
154
+
155
+
156
+ def make_metric_matrix(global_results, methods, pos, name, ds):
157
+ result = []
158
+ for m in global_results:
159
+ result += [[global_results[m][d[0] + '_' + name + '_at_' + str(pos)] for d in ds]]
160
+ result = np.array(result)
161
+ result = pd.DataFrame(result.T, index=[d[0] for d in ds], columns=[k[:-8] for k in list(global_results.keys())])
162
+
163
+ matrix_means, matrix_stds = [], []
164
+
165
+ for method in methods:
166
+ matrix_means += [result.iloc[:, [(method) in c for c in result.columns]].mean(axis=1)]
167
+ matrix_stds += [result.iloc[:, [(method) in c for c in result.columns]].std(axis=1)]
168
+
169
+ matrix_means = pd.DataFrame(matrix_means, index=methods).T
170
+ matrix_stds = pd.DataFrame(matrix_stds, index=methods).T
171
+
172
+ return matrix_means, matrix_stds
173
+
174
+
175
+ def make_ranks_and_wins_table(matrix):
176
+ for dss in matrix.T:
177
+ matrix.loc[dss] = rankdata(-matrix.round(3).loc[dss])
178
+ ranks_acc = matrix.mean()
179
+ wins_acc = (matrix == 1).sum()
180
+
181
+ return ranks_acc, wins_acc
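The individual metric functions above accept either numpy arrays or torch tensors; a tiny hedged example on a binary toy problem (not part of the commit):

import numpy as np

target = np.array([0, 1, 1, 0])
pred = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3]])   # class probabilities

print(auc_metric(target, pred))        # binary ROC-AUC, uses pred[:, 1] internally
print(cross_entropy(target, pred))     # binary cross-entropy on pred[:, 1]
print(accuracy_metric(target, pred))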
TabPFN/scripts/transformer_prediction_interface.py ADDED
@@ -0,0 +1,357 @@
1
+ import torch
2
+ import random
3
+
4
+ from torch.utils.checkpoint import checkpoint
5
+
6
+ from utils import normalize_data, to_ranking_low_mem, remove_outliers
7
+ from priors.utils import normalize_by_used_features_f
8
+ from utils import NOP
9
+
10
+ from sklearn.preprocessing import PowerTransformer, QuantileTransformer, RobustScaler
11
+
12
+ from notebook_utils import CustomUnpickler
13
+
14
+ import numpy as np
15
+ from sklearn.base import BaseEstimator, ClassifierMixin
16
+ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
17
+ from sklearn.utils.multiclass import check_classification_targets
18
+ from sklearn.utils import column_or_1d
19
+ from pathlib import Path
20
+ from model_builder import load_model
21
+ import os
22
+
23
+ def load_model_workflow(i, e, add_name, base_path, device='cpu', eval_addition=''):
24
+ """
25
+ Workflow for loading a model and setting appropriate parameters for diffable hparam tuning.
26
+
27
+ :param i:
28
+ :param e:
29
+ :param eval_positions_valid:
30
+ :param add_name:
31
+ :param base_path:
32
+ :param device:
33
+ :param eval_addition:
34
+ :return:
35
+ """
36
+ def check_file(e):
37
+ model_file = f'models_diff/prior_diff_real_checkpoint{add_name}_n_{i}_epoch_{e}.cpkt'
38
+ model_path = os.path.join(base_path, model_file)
39
+ # print('Evaluate ', model_path)
40
+ results_file = os.path.join(base_path,
41
+ f'models_diff/prior_diff_real_results{add_name}_n_{i}_epoch_{e}_{eval_addition}.pkl')
42
+ if not Path(model_path).is_file(): # or Path(results_file).is_file():
43
+ return None, None, None
44
+ return model_file, model_path, results_file
45
+
46
+ model_file = None
47
+ if e == -1:
48
+ for e_ in range(100, -1, -1):
49
+ model_file_, model_path_, results_file_ = check_file(e_)
50
+ if model_file_ is not None:
51
+ e = e_
52
+ model_file, model_path, results_file = model_file_, model_path_, results_file_
53
+ break
54
+ else:
55
+ model_file, model_path, results_file = check_file(e)
56
+
57
+ if model_file is None:
58
+ print('No checkpoint found')
59
+ return None
60
+
61
+ print(f'Loading {model_file}')
62
+
63
+ model, c = load_model(base_path, model_file, device, eval_positions=[], verbose=False)
64
+
65
+ return model, c, results_file
66
+
67
+
68
+ class TabPFNClassifier(BaseEstimator, ClassifierMixin):
69
+
70
+ def __init__(self, device='cpu', base_path='.'):
71
+ # Model file specification (Model name, Epoch)
72
+ model_string = ''
73
+ i, e = '8x_lr0.0003', -1
74
+
75
+ # File which contains result of hyperparameter tuning run: style (i.e. hyperparameters) and a dataframe with results.
76
+ style_file = 'prior_tuning_result.pkl'
77
+
78
+ model, c, results_file = load_model_workflow(i, e, add_name=model_string, base_path=base_path, device=device,
79
+ eval_addition='')
80
+ style, temperature = self.load_result_minimal(style_file, i, e, base_path=base_path)
81
+
82
+ self.device = device
83
+ self.base_path = base_path
84
+ self.model = model
85
+ self.c = c
86
+ self.style = style
87
+ self.temperature = temperature
88
+
89
+ self.max_num_features = self.c['num_features']
90
+ self.max_num_classes = self.c['max_num_classes']
91
+
92
+ def load_result_minimal(self, path, i, e, base_path='.'):
93
+ with open(os.path.join(base_path,path), 'rb') as output:
94
+ _, _, _, style, temperature, optimization_route = CustomUnpickler(output).load()
95
+
96
+ return style, temperature
97
+
98
+ def fit(self, X, y):
99
+ # Check that X and y have correct shape
100
+ X, y = check_X_y(X, y)
101
+ y = self._validate_targets(y)
102
+
103
+ self.X_ = X
104
+ self.y_ = y
105
+
106
+ if X.shape[1] > self.max_num_features:
107
+ raise ValueError("The number of features for this classifier is restricted to ", self.max_num_features)
108
+ if len(np.unique(y)) > self.max_num_classes:
109
+ raise ValueError("The number of classes for this classifier is restricted to ", self.max_num_classes)
110
+
111
+ # Return the classifier
112
+ return self
113
+
114
+ def _validate_targets(self, y):
115
+ y_ = column_or_1d(y, warn=True)
116
+ check_classification_targets(y)
117
+ cls, y = np.unique(y_, return_inverse=True)
118
+ if len(cls) < 2:
119
+ raise ValueError(
120
+ "The number of classes has to be greater than one; got %d class"
121
+ % len(cls)
122
+ )
123
+
124
+ self.classes_ = cls
125
+
126
+ return np.asarray(y, dtype=np.float64, order="C")
127
+
128
+ def predict_proba(self, X):
129
+ # Check is fit had been called
130
+ check_is_fitted(self)
131
+
132
+ # Input validation
133
+ X = check_array(X)
134
+
135
+ X_full = np.concatenate([self.X_, X], axis=0)
136
+ X_full = torch.tensor(X_full, device=self.device).float().unsqueeze(1)
137
+ y_full = np.concatenate([self.y_, self.y_[0] + np.zeros_like(X[:, 0])], axis=0)
138
+ y_full = torch.tensor(y_full, device=self.device).float().unsqueeze(1)
139
+
140
+ eval_pos = self.X_.shape[0]
141
+
142
+ prediction = transformer_predict(self.model[2], X_full, y_full, eval_pos,
143
+ device=self.device,
144
+ style=self.style,
145
+ inference_mode=True,
146
+ N_ensemble_configurations=10,
147
+ softmax_temperature=self.temperature
148
+ , **get_params_from_config(self.c))
149
+ prediction_ = prediction.squeeze(0)
150
+
151
+ return prediction_.detach().cpu().numpy()
152
+
153
+ def predict(self, X, return_winning_probability=False):
154
+ p = self.predict_proba(X)
155
+ y = np.argmax(self.predict_proba(X), axis=-1)
156
+ y = self.classes_.take(np.asarray(y, dtype=np.intp))
157
+ if return_winning_probability:
158
+ return y, p.max(axis=-1)
159
+ return y
160
+
161
+ def transformer_predict(model, eval_xs, eval_ys, eval_position,
162
+ device='cpu',
163
+ max_features=100,
164
+ style=None,
165
+ inference_mode=False,
166
+ num_classes=2,
167
+ extend_features=True,
168
+ normalize_to_ranking=False,
169
+ softmax_temperature=0.0,
170
+ multiclass_decoder='permutation',
171
+ preprocess_transform='mix',
172
+ categorical_feats=[],
173
+ feature_shift_decoder=True,
174
+ N_ensemble_configurations=10,
175
+ average_logits=True,
176
+ normalize_with_sqrt=False, **kwargs):
177
+ """
178
+
179
+ :param model:
180
+ :param eval_xs:
181
+ :param eval_ys: should be classes that are 0-indexed and every class until num_classes-1 is present
182
+ :param eval_position:
183
+ :param rescale_features:
184
+ :param device:
185
+ :param max_features:
186
+ :param style:
187
+ :param inference_mode:
188
+ :param num_classes:
189
+ :param extend_features:
190
+ :param normalize_to_ranking:
191
+ :param softmax_temperature:
192
+ :param multiclass_decoder:
193
+ :param preprocess_transform:
194
+ :param categorical_feats:
195
+ :param feature_shift_decoder:
196
+ :param N_ensemble_configurations:
197
+ :param average_logits:
198
+ :param normalize_with_sqrt:
199
+ :param metric_used:
200
+ :return:
201
+ """
202
+ num_classes = len(torch.unique(eval_ys))
203
+
204
+ def predict(eval_xs, eval_ys, used_style, softmax_temperature, return_logits):
205
+ # Initialize results array size S, B, Classes
206
+
207
+ inference_mode_call = torch.inference_mode() if inference_mode else NOP()
208
+ with inference_mode_call:
209
+ output = model(
210
+ (used_style.repeat(eval_xs.shape[1], 1) if used_style is not None else None, eval_xs, eval_ys.float()),
211
+ single_eval_pos=eval_position)[:, :, 0:num_classes]
212
+
213
+ output = output[:, :, 0:num_classes] / torch.exp(softmax_temperature)
214
+ if not return_logits:
215
+ output = torch.nn.functional.softmax(output, dim=-1)
216
+ #else:
217
+ # output[:, :, 1] = model((style.repeat(eval_xs.shape[1], 1) if style is not None else None, eval_xs, eval_ys.float()),
218
+ # single_eval_pos=eval_position)
219
+
220
+ # output[:, :, 1] = torch.sigmoid(output[:, :, 1]).squeeze(-1)
221
+ # output[:, :, 0] = 1 - output[:, :, 1]
222
+
223
+ #print('RESULTS', eval_ys.shape, torch.unique(eval_ys, return_counts=True), output.mean(axis=0))
224
+
225
+ return output
226
+
227
+ def preprocess_input(eval_xs, preprocess_transform):
228
+ import warnings
229
+
230
+ if eval_xs.shape[1] > 1:
231
+ raise Exception("Transforms only allow one batch dim - TODO")
232
+ if preprocess_transform != 'none':
233
+ if preprocess_transform == 'power' or preprocess_transform == 'power_all':
234
+ pt = PowerTransformer(standardize=True)
235
+ elif preprocess_transform == 'quantile' or preprocess_transform == 'quantile_all':
236
+ pt = QuantileTransformer(output_distribution='normal')
237
+ elif preprocess_transform == 'robust' or preprocess_transform == 'robust_all':
238
+ pt = RobustScaler(unit_variance=True)
239
+
240
+ # eval_xs, eval_ys = normalize_data(eval_xs), normalize_data(eval_ys)
241
+ eval_xs = normalize_data(eval_xs)
242
+
243
+ # Removing empty features
244
+ eval_xs = eval_xs[:, 0, :].cpu().numpy()
245
+ sel = [len(np.unique(eval_xs[0:eval_ys.shape[0], col])) > 1 for col in range(eval_xs.shape[1])]
246
+ eval_xs = np.array(eval_xs[:, sel])
247
+
248
+ warnings.simplefilter('error')
249
+ if preprocess_transform != 'none':
250
+ feats = set(range(eval_xs.shape[1])) if 'all' in preprocess_transform else set(
251
+ range(eval_xs.shape[1])) - set(categorical_feats)
252
+ for col in feats:
253
+ try:
254
+ pt.fit(eval_xs[0:eval_ys.shape[0], col:col + 1])
255
+ trans = pt.transform(eval_xs[:, col:col + 1])
256
+ # print(scipy.stats.spearmanr(trans[~np.isnan(eval_xs[:, col:col+1])], eval_xs[:, col:col+1][~np.isnan(eval_xs[:, col:col+1])]))
257
+ eval_xs[:, col:col + 1] = trans
258
+ except:
259
+ pass
260
+ warnings.simplefilter('default')
261
+
262
+ eval_xs = torch.tensor(eval_xs).float().unsqueeze(1).to(device)
263
+
264
+ # eval_xs = normalize_data(eval_xs)
265
+
266
+ # TODO: Caution, there is information leakage when to_ranking is used; we should not use it
267
+ eval_xs = remove_outliers(eval_xs) if not normalize_to_ranking else normalize_data(to_ranking_low_mem(eval_xs))
268
+
269
+ # Rescale X
270
+ eval_xs = normalize_by_used_features_f(eval_xs, eval_xs.shape[-1], max_features,
271
+ normalize_with_sqrt=normalize_with_sqrt)
272
+ return eval_xs.detach()
273
+
274
+ eval_xs, eval_ys = eval_xs.to(device), eval_ys.to(device)
275
+ eval_ys = eval_ys[:eval_position]
276
+
277
+ model.to(device)
278
+ style = style.to(device)
279
+
280
+ model.eval()
281
+
282
+ import itertools
283
+ style = style.unsqueeze(0) if len(style.shape) == 1 else style
284
+ num_styles = style.shape[0]
285
+ styles_configurations = range(0, num_styles)
286
+ preprocess_transform_configurations = [preprocess_transform if i % 2 == 0 else 'none' for i in range(0, num_styles)]
287
+ if preprocess_transform == 'mix':
288
+ def get_preprocess(i):
289
+ if i == 0:
290
+ return 'power_all'
291
+ if i == 1:
292
+ return 'robust_all'
293
+ if i == 2:
294
+ return 'none'
295
+ preprocess_transform_configurations = [get_preprocess(i) for i in range(0, num_styles)]
296
+ styles_configurations = zip(styles_configurations, preprocess_transform_configurations)
297
+
298
+ feature_shift_configurations = range(0, eval_xs.shape[2]) if feature_shift_decoder else [0]
299
+ class_shift_configurations = range(0, len(torch.unique(eval_ys))) if multiclass_decoder == 'permutation' else [0]
300
+
301
+ ensemble_configurations = list(itertools.product(styles_configurations, feature_shift_configurations, class_shift_configurations))
302
+ random.shuffle(ensemble_configurations)
303
+ ensemble_configurations = ensemble_configurations[0:N_ensemble_configurations]
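+ # The ensemble is the cross product of style, feature-rotation and class-permutation configurations, shuffled and truncated to N_ensemble_configurations members.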
304
+
305
+ output = None
306
+
307
+ eval_xs_transformed = {}
308
+ for ensemble_configuration in ensemble_configurations:
309
+ (styles_configuration, preprocess_transform_configuration), feature_shift_configuration, class_shift_configuration = ensemble_configuration
310
+
311
+ style_ = style[styles_configuration:styles_configuration+1, :]
312
+ softmax_temperature_ = softmax_temperature[styles_configuration]
313
+
314
+ eval_xs_, eval_ys_ = eval_xs.clone(), eval_ys.clone()
315
+
316
+ if preprocess_transform_configuration in eval_xs_transformed:
317
+ eval_xs_ = eval_xs_transformed[preprocess_transform_configuration].clone()
318
+ else:
319
+ eval_xs_ = preprocess_input(eval_xs_, preprocess_transform=preprocess_transform_configuration)
320
+ eval_xs_transformed[preprocess_transform_configuration] = eval_xs_
321
+
322
+ eval_ys_ = ((eval_ys_ + class_shift_configuration) % num_classes).float()
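+ # Shift the class labels for this ensemble member; the corresponding output columns are rotated back after prediction.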
323
+
324
+ eval_xs_ = torch.cat([eval_xs_[..., feature_shift_configuration:],eval_xs_[..., :feature_shift_configuration]],dim=-1)
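+ # Rotate the feature order for this ensemble member.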
325
+
326
+ # Extend X
327
+ if extend_features:
328
+ eval_xs_ = torch.cat(
329
+ [eval_xs_,
330
+ torch.zeros((eval_xs_.shape[0], eval_xs_.shape[1], max_features - eval_xs_.shape[2])).to(device)], -1)
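+ # Zero-pad the feature dimension up to the max_features expected by the model.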
331
+
332
+ #preprocess_transform_ = preprocess_transform if styles_configuration % 2 == 0 else 'none'
333
+ import warnings
334
+ with warnings.catch_warnings():
335
+ warnings.filterwarnings("ignore", message="None of the inputs have requires_grad=True. Gradients will be None")
336
+ output_ = checkpoint(predict, eval_xs_, eval_ys_, style_, softmax_temperature_, True)
337
+ output_ = torch.cat([output_[..., class_shift_configuration:],output_[..., :class_shift_configuration]],dim=-1)
338
+
339
+ #output_ = predict(eval_xs, eval_ys, style_, preprocess_transform_)
340
+ if not average_logits:
341
+ output_ = torch.nn.functional.softmax(output_, dim=-1)
342
+ output = output_ if output is None else output + output_
343
+
344
+ output = output / len(ensemble_configurations)
345
+ if average_logits:
346
+ output = torch.nn.functional.softmax(output, dim=-1)
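+ # When logits were averaged, the softmax is applied once here; otherwise per-member probabilities were already averaged above.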
347
+
348
+ output = torch.transpose(output, 0, 1)
349
+
350
+ return output
351
+
352
+ def get_params_from_config(c):
353
+ return {'max_features': c['num_features']
354
+ , 'rescale_features': c["normalize_by_used_features"]
355
+ , 'normalize_to_ranking': c["normalize_to_ranking"]
356
+ , 'normalize_with_sqrt': c.get("normalize_with_sqrt", False)
357
+ }
TabPFN/tabular_evaluation.py ADDED
@@ -0,0 +1,283 @@
1
+ import time
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from tqdm import tqdm
6
+ import random
7
+ import numpy as np
8
+
9
+ from torch import nn
10
+
11
+ from utils import torch_nanmean
12
+ from datasets import *
13
+ from model_builder import load_model
14
+ from scripts.tabular_baselines import get_scoring_string
15
+ from scripts import tabular_metrics
16
+ from scripts.transformer_prediction_interface import *
17
+ from scripts.baseline_prediction_interface import *
18
+ """
19
+ ===============================
20
+ PUBLIC FUNCTIONS FOR EVALUATION
21
+ ===============================
22
+ """
23
+
24
+
25
+ def eval_model(i, e, valid_datasets, test_datasets, eval_positions, bptt, add_name, base_path, device='cpu', eval_addition='', **kwargs):
26
+ metrics_test, config_sample, model_path = eval_model_on_ds(i, e, test_datasets, eval_positions, bptt, add_name, base_path, device=device, eval_addition=eval_addition, **kwargs)
27
+ metrics_valid, _, _ = eval_model_on_ds(i, e, valid_datasets, eval_positions, bptt, add_name, base_path, device=device, eval_addition=eval_addition, **kwargs)
28
+ return {'mean_auc_test': metrics_test['mean_roc_at_1000'], 'mean_auc_valid': metrics_valid['mean_roc_at_1000'], 'mean_ce_test': metrics_test['mean_ce_at_1000'], 'mean_ce_valid': metrics_valid['mean_ce_at_1000'], 'config_sample': config_sample, 'model_path': model_path}
29
+
30
+ def eval_model_on_ds(i, e, valid_datasets, eval_positions, bptt, add_name, base_path, device='cpu', eval_addition='', **kwargs):
31
+
32
+ # How to use: evaluate_without_fitting(i,0,valid_datasets, [1024], 100000, add_name=model_string, base_path=base_path,)
33
+ def check_file(e):
34
+ model_file = f'models_diff/prior_diff_real_checkpoint{add_name}_n_{i}_epoch_{e}.cpkt'
35
+ model_path = os.path.join(base_path, model_file)
36
+ # print('Evaluate ', model_path)
37
+ results_file = os.path.join(base_path,
38
+ f'models_diff/prior_diff_real_results{add_name}_n_{i}_epoch_{e}_{eval_addition}.pkl')
39
+ if not Path(model_path).is_file(): # or Path(results_file).is_file():
40
+ # print('checkpoint exists: ', Path(model_file).is_file(), ', results are written:', Path(results_file).is_file())
41
+ return None, None, None
42
+ return model_file, model_path, results_file
43
+
44
+ if e == -1: # use last checkpoint, if e == -1
45
+ for e_ in range(100, -1, -1):
46
+ model_file_, model_path_, results_file_ = check_file(e_)
47
+ if model_file_ is not None:
48
+ e = e_
49
+ model_file, model_path, results_file = model_file_, model_path_, results_file_
50
+ break
51
+ else:
52
+ model_file, model_path, results_file = check_file(e)
53
+
54
+ model, config_sample = load_model(base_path, model_file, device, None, verbose=False)
55
+
56
+ params = {'max_features': config_sample['num_features']
57
+ , 'rescale_features': config_sample["normalize_by_used_features"]
58
+ , 'normalize_to_ranking': config_sample["normalize_to_ranking"]
59
+ , 'normalize_with_sqrt': config_sample.get("normalize_with_sqrt", False)
60
+ }
61
+ metrics_valid = evaluate(datasets=valid_datasets, model=model[2], method='transformer', device=device, overwrite=True,
62
+ extend_features=True
63
+ # the style keyword is omitted here; the transformer was trained with a style input, which is simply empty
64
+ , save=False
65
+ , metric_used=tabular_metrics.cross_entropy
66
+ , return_tensor=True
67
+ , verbose=False
68
+ , eval_positions=eval_positions
69
+ , bptt=bptt
70
+ , base_path=None
71
+ , inference_mode=True
72
+ , **params
73
+ , **kwargs)
74
+
75
+ tabular_metrics.calculate_score_per_method(tabular_metrics.auc_metric, 'roc', metrics_valid, valid_datasets, eval_positions)
76
+ tabular_metrics.calculate_score_per_method(tabular_metrics.cross_entropy, 'ce', metrics_valid, valid_datasets, eval_positions)
77
+
78
+ return metrics_valid, config_sample, model_path
79
+
80
+
81
+ def evaluate(datasets, bptt, eval_positions, metric_used, model
82
+ , verbose=False
83
+ , return_tensor=False
84
+ , **kwargs):
85
+ """
86
+ Evaluates a list of datasets for a model function.
87
+
88
+ :param datasets: List of datasets
89
+ :param bptt: maximum sequence length
90
+ :param eval_positions: List of positions where to evaluate models
91
+ :param verbose: If True, shows a progress bar while iterating over datasets.
92
+ :param metric_used: Which metric is optimized for.
93
+ :param return_tensor: Whether to return results as a torch.Tensor or as numpy values; only relevant for the transformer.
94
+ :param kwargs:
95
+ :return:
96
+ """
97
+ overall_result = {'metric_used': get_scoring_string(metric_used)
98
+ , 'bptt': bptt
99
+ , 'eval_positions': eval_positions}
100
+
101
+ aggregated_metric_datasets, num_datasets = torch.tensor(0.0), 0
102
+
103
+ # For each dataset
104
+ for [ds_name, X, y, categorical_feats, _, _] in (tqdm(datasets, desc='Iterate over datasets') if verbose else datasets):
105
+ dataset_bptt = min(len(X), bptt)
106
+ # if verbose and dataset_bptt < bptt:
107
+ # print(f'Dataset too small for given sequence length, reducing to {len(X)} ({bptt})')
108
+
109
+ aggregated_metric, num = torch.tensor(0.0), 0
110
+ ds_result = {}
111
+
112
+ for eval_position in eval_positions:
113
+ eval_position_real = int(dataset_bptt * 0.5) if 2 * eval_position > dataset_bptt else eval_position
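+ # If twice the requested eval position exceeds the dataset-limited sequence length, fall back to a 50/50 train/eval split.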
114
+ eval_position_bptt = int(eval_position_real * 2.0)
115
+
116
+ r = evaluate_position(X, y, model=model
117
+ , num_classes=len(torch.unique(y))
118
+ , categorical_feats = categorical_feats
119
+ , bptt = eval_position_bptt
120
+ , ds_name=ds_name
121
+ , eval_position = eval_position_real
122
+ , metric_used = metric_used
123
+ ,**kwargs)
124
+
125
+ if r is None:
126
+ continue
127
+
128
+ _, outputs, ys, best_configs, time_used = r
129
+
130
+ if torch.is_tensor(outputs):
131
+ outputs = outputs.to(outputs.device)
132
+ ys = ys.to(outputs.device)
133
+
134
+ ys = ys.T
135
+ ds_result[f'{ds_name}_best_configs_at_{eval_position}'] = best_configs
136
+ ds_result[f'{ds_name}_outputs_at_{eval_position}'] = outputs
137
+ ds_result[f'{ds_name}_ys_at_{eval_position}'] = ys
138
+ ds_result[f'{ds_name}_time_at_{eval_position}'] = time_used
139
+
140
+ new_metric = torch_nanmean(torch.stack([metric_used(ys[i], outputs[i]) for i in range(ys.shape[0])]))
141
+
142
+ if not return_tensor:
143
+ make_scalar = lambda x: float(x.detach().cpu().numpy()) if (torch.is_tensor(x) and (len(x.shape) == 0)) else x
144
+ new_metric = make_scalar(new_metric)
145
+ ds_result = {k: make_scalar(ds_result[k]) for k in ds_result.keys()}
146
+
147
+ lib = torch if return_tensor else np
148
+ if not lib.isnan(new_metric).any():
149
+ aggregated_metric, num = aggregated_metric + new_metric, num + 1
150
+
151
+ overall_result.update(ds_result)
152
+ if num > 0:
153
+ aggregated_metric_datasets, num_datasets = (aggregated_metric_datasets + (aggregated_metric / num)), num_datasets + 1
154
+
155
+ overall_result['mean_metric'] = aggregated_metric_datasets / num_datasets
156
+
157
+ return overall_result
158
+
159
+ """
160
+ ===============================
161
+ INTERNAL HELPER FUNCTIONS
162
+ ===============================
163
+ """
164
+
165
+ def check_file_exists(path):
166
+ """Checks if a pickle file exists. Returns None if not, else returns the unpickled file."""
167
+ if (os.path.isfile(path)):
168
+ print(f'loading results from {path}')
169
+ with open(path, 'rb') as f:
170
+ return np.load(f, allow_pickle=True).tolist()
171
+ return None
172
+
173
+ def generate_valid_split(X, y, bptt, eval_position, split_number=1):
174
+ """Generates a deteministic train-(test/valid) split. Both splits must contain the same classes and all classes in
175
+ the entire datasets. If no such split can be sampled in 7 passes, returns None.
176
+
177
+ :param X: torch tensor, feature values
178
+ :param y: torch tensor, class values
179
+ :param bptt: Number of samples in train + test
180
+ :param eval_position: Number of samples in train, i.e. from which index values are in test
181
+ :param split_number: The split id
182
+ :return:
183
+ """
184
+ done, seed = False, 13
185
+
186
+ torch.manual_seed(split_number)
187
+ perm = torch.randperm(X.shape[0]) if split_number > 1 else torch.arange(0, X.shape[0])
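+ # split_number 1 keeps the original sample order; larger split numbers shuffle deterministically, seeded by split_number.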
188
+ X, y = X[perm], y[perm]
189
+
190
+ while not done:
191
+ if seed > 20:
192
+ return None, None # No split could be generated in 7 passes, return None
193
+ random.seed(seed)
194
+ i = random.randint(0, len(X) - bptt) if len(X) - bptt > 0 else 0
195
+ y_ = y[i:i + bptt]
196
+
197
+ # Checks if all classes from dataset are contained and classes in train and test are equal (contain same
198
+ # classes) and
199
+ done = len(torch.unique(y_)) == len(torch.unique(y))
200
+ done = done and torch.all(torch.unique(y_) == torch.unique(y))
201
+ done = done and len(torch.unique(y_[:eval_position])) == len(torch.unique(y_[eval_position:]))
202
+ done = done and torch.all(torch.unique(y_[:eval_position]) == torch.unique(y_[eval_position:]))
203
+ seed = seed + 1
204
+
205
+ eval_xs = torch.stack([X[i:i + bptt].clone()], 1)
206
+ eval_ys = torch.stack([y[i:i + bptt].clone()], 1)
207
+
208
+ return eval_xs, eval_ys
209
+
210
+
211
+ def evaluate_position(X, y, categorical_feats, model, bptt
212
+ , eval_position, overwrite, save, base_path, path_interfix, method, ds_name, fetch_only=False
213
+ , max_time=300, split_number=1
214
+ , per_step_normalization=False, **kwargs):
215
+ """
216
+ Evaluates a dataset with a 'bptt' number of training samples.
217
+
218
+ :param X: Dataset X
219
+ :param y: Dataset labels
220
+ :param categorical_feats: Indices of categorical features.
221
+ :param model: Model function
222
+ :param bptt: Sequence length.
223
+ :param eval_position: Number of training samples.
224
+ :param overwrite: Wheater to ove
225
+ :param overwrite: If True, results on disk are overwritten.
226
+ :param save:
227
+ :param path_interfix: Used for constructing path to write on disk.
228
+ :param method: Model name.
229
+ :param ds_name: Datset name.
230
+ :param fetch_only: Wheater to calculate or only fetch results.
231
+ :param per_step_normalization:
232
+ :param kwargs:
233
+ :return:
234
+ """
235
+
236
+ if save:
237
+ path = os.path.join(base_path, f'results/tabular/{path_interfix}/results_{method}_{ds_name}_{eval_position}_{bptt}_{split_number}.npy')
238
+ #log_path =
239
+
240
+ ## Load results if on disk
241
+ if not overwrite:
242
+ result = check_file_exists(path)
243
+ if result is not None:
244
+ if not fetch_only:
245
+ print(f'Loaded saved result for {path}')
246
+ return result
247
+ elif fetch_only:
248
+ print(f'Could not load saved result for {path}')
249
+ return None
250
+
251
+ ## Generate data splits
252
+ eval_xs, eval_ys = generate_valid_split(X, y, bptt, eval_position, split_number=split_number)
253
+ if eval_xs is None:
254
+ print(f"No dataset could be generated {ds_name} {bptt}")
255
+ return None
256
+
257
+ eval_ys = (eval_ys > torch.unique(eval_ys).unsqueeze(0)).sum(axis=1).unsqueeze(-1)
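+ # Relabel classes to 0..K-1 by counting how many unique label values each label exceeds.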
258
+
259
+ start_time = time.time()
260
+
261
+ if isinstance(model, nn.Module): # Two separate predict interfaces for transformer and baselines
262
+ outputs, best_configs = transformer_predict(model, eval_xs, eval_ys, eval_position, categorical_feats=categorical_feats, **kwargs), None
263
+ else:
264
+ _, outputs, best_configs = baseline_predict(model, eval_xs, eval_ys, categorical_feats
265
+ , eval_pos=eval_position
266
+ , max_time=max_time, **kwargs)
267
+
268
+ eval_ys = eval_ys[eval_position:]
269
+ if outputs is None:
270
+ return None
271
+
272
+ if torch.is_tensor(outputs): # Transfers data to cpu for saving
273
+ outputs = outputs.cpu()
274
+ eval_ys = eval_ys.cpu()
275
+
276
+ ds_result = None, outputs, eval_ys, best_configs, time.time() - start_time
277
+
278
+ if save:
279
+ with open(path, 'wb') as f:
280
+ np.save(f, ds_result)
281
+ print(f'saved results to {path}')
282
+
283
+ return ds_result
TabPFN/train.py ADDED
@@ -0,0 +1,386 @@
1
+ import os
2
+ import itertools
3
+ import argparse
4
+ import time
5
+ import datetime
6
+ import yaml
7
+ from contextlib import nullcontext
8
+
9
+
10
+ import torch
11
+ from torch import nn
12
+
13
+ import utils
14
+ from transformer import TransformerModel
15
+ from utils import get_cosine_schedule_with_warmup, get_openai_lr, StoreDictKeyPair, get_weighted_single_eval_pos_sampler, get_uniform_single_eval_pos_sampler
16
+ import priors
17
+ import encoders
18
+ import positional_encodings
19
+ from utils import init_dist
20
+ from torch.cuda.amp import autocast
21
+
22
+ class Losses():
23
+ gaussian = nn.GaussianNLLLoss(full=True, reduction='none')
24
+ mse = nn.MSELoss(reduction='none')
25
+ ce = lambda weight : nn.CrossEntropyLoss(reduction='none', weight=weight)
26
+ bce = nn.BCEWithLogitsLoss(reduction='none')
27
+
28
+
29
+ def train(priordataloader_class, criterion, encoder_generator, emsize=200, nhid=200, nlayers=6, nhead=2, dropout=0.2,
30
+ epochs=10, steps_per_epoch=100, batch_size=200, bptt=10, lr=None, weight_decay=0.0, warmup_epochs=10, input_normalization=False,
31
+ y_encoder_generator=None, pos_encoder_generator=None, decoder=None, extra_prior_kwargs_dict={}, scheduler=get_cosine_schedule_with_warmup,
32
+ load_weights_from_this_state_dict=None, validation_period=10, single_eval_pos_gen=None, bptt_extra_samples=None, gpu_device='cuda:0',
33
+ aggregate_k_gradients=1, verbose=True, style_encoder_generator=None, check_is_compatible=True, epoch_callback=None,
34
+ initializer=None, initialize_with_model=None, train_mixed_precision=False, total_available_time_in_s=None, normalize_labels=True, **model_extra_args
35
+ ):
36
+ assert (epochs is None) != (total_available_time_in_s is None)
37
+ start_of_training = time.time()
38
+ device = gpu_device if torch.cuda.is_available() else 'cpu:0'
39
+ print(f'Using {device} device')
40
+ using_dist, rank, device = init_dist(device)
41
+ bptt_sampler = (lambda : single_eval_pos_gen() + bptt_extra_samples if callable(single_eval_pos_gen) else single_eval_pos_gen + bptt_extra_samples) if bptt_extra_samples is not None else bptt
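+ # When bptt_extra_samples is set, each batch's sequence length is the sampled eval position plus the extra samples; otherwise the fixed bptt is used.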
42
+ dl = priordataloader_class(num_steps=steps_per_epoch, batch_size=batch_size, seq_len=bptt_sampler, seq_len_maximum=bptt+(bptt_extra_samples if bptt_extra_samples else 0), device=device, **extra_prior_kwargs_dict)
43
+ if dl.fuse_x_y:
44
+ raise Exception("Illegal parameter")
45
+
46
+ encoder = encoder_generator(dl.num_features+1 if dl.fuse_x_y else dl.num_features,emsize)
47
+ style_def = next(iter(dl))[0][0] # This is (style, x, y), target with x and y with batch size
48
+
49
+ style_encoder = style_encoder_generator(hyperparameter_definitions=style_def[0], em_size=emsize) if (style_def is not None) else None
50
+ n_out = dl.num_outputs
51
+ if isinstance(criterion, nn.GaussianNLLLoss):
52
+ n_out *= 2
53
+ elif isinstance(criterion, nn.CrossEntropyLoss):
54
+ n_out *= criterion.weight.shape[0]
55
+ model = TransformerModel(encoder, n_out, emsize, nhead, nhid, nlayers, dropout, style_encoder=style_encoder,
56
+ y_encoder=y_encoder_generator(dl.num_outputs, emsize), input_normalization=input_normalization,
57
+ pos_encoder=(pos_encoder_generator or positional_encodings.NoPositionalEncoding)(emsize, bptt*2),
58
+ decoder=decoder, init_method=initializer, **model_extra_args
59
+ )
60
+ model.criterion = criterion
61
+ if load_weights_from_this_state_dict is not None:
62
+ model.load_state_dict(load_weights_from_this_state_dict)
63
+ if initialize_with_model is not None:
64
+ model.init_from_small_model(initialize_with_model)
65
+
66
+ print(f"Using a Transformer with {sum(p.numel() for p in model.parameters())/1000/1000:.{2}f} M parameters")
67
+
68
+ try:
69
+ for (k, v), (k2, v2) in zip(model.state_dict().items(), initialize_with_model.state_dict().items()):
70
+ print(k, ((v - v2) / v).abs().mean(), v.shape)
71
+ except Exception:
72
+ pass
73
+
74
+ model.to(device)
75
+ if using_dist:
76
+ print("Distributed training")
77
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank, broadcast_buffers=False)
78
+
79
+
80
+ # learning rate
81
+ if lr is None:
82
+ lr = get_openai_lr(model)
83
+ print(f"Using OpenAI max lr of {lr}.")
84
+ optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
85
+ scheduler = scheduler(optimizer, warmup_epochs, epochs if epochs is not None else 100) # when training for fixed time lr schedule takes 100 steps
86
+
87
+ def train_step():
88
+ model.train() # Turn on the train mode
89
+ total_loss = 0.
90
+ total_positional_losses = 0.
91
+ total_positional_losses_recorded = 0
92
+ before_get_batch = time.time()
93
+ assert len(dl) % aggregate_k_gradients == 0, 'Please set the number of steps per epoch s.t. `aggregate_k_gradients` divides it.'
94
+ valid_batch_steps = 0.0
95
+ for batch, (data, targets) in enumerate(dl):
96
+ if using_dist and not (batch % aggregate_k_gradients == aggregate_k_gradients - 1):
97
+ cm = model.no_sync()
98
+ #print(f'p={rank}, no_sync', force=True)
99
+ else:
100
+ cm = nullcontext()
101
+ #print(f'p={rank}, sync', force=True)
102
+ with cm:
103
+ time_to_get_batch = time.time() - before_get_batch
104
+ before_forward = time.time()
105
+ if bptt_extra_samples is None:
106
+ single_eval_pos = single_eval_pos_gen() if callable(single_eval_pos_gen) else single_eval_pos_gen
107
+ else:
108
+ single_eval_pos = targets.shape[0] - bptt_extra_samples
109
+
110
+ is_compatible = torch.ones((targets.shape[1])).bool()
111
+ if check_is_compatible or normalize_labels:
112
+ for b in range(targets.shape[1]):
113
+ targets_in_train = torch.unique(targets[:single_eval_pos, b], sorted=True)
114
+ targets_in_eval = torch.unique(targets[single_eval_pos:, b], sorted=True)
115
+
116
+ if check_is_compatible:
117
+ is_compatible[b] = len(targets_in_train) == len(targets_in_eval) and (targets_in_train == targets_in_eval).all()
118
+ is_compatible[b] = is_compatible[b] and len(targets_in_train) > 1
119
+
120
+ # Set targets to range starting from 0 (e.g. targets 0, 2, 5, 2 will be converted to 0, 1, 2, 1)
121
+ if normalize_labels:
122
+ targets[:, b] = (targets[:, b] > torch.unique(targets[:, b]).unsqueeze(1)).sum(axis=0).unsqueeze(0)
123
+ valid_batch_steps += is_compatible.float().mean()
124
+ is_compatible = is_compatible.to(device)
125
+ #if using_dist and check_is_compatible:
126
+ # print('step share before reduce',curr_step_share, force=True)
127
+ # curr_step_share = curr_step_share.to(device)
128
+ # torch.distributed.all_reduce_multigpu([curr_step_share], op=torch.distributed.ReduceOp.SUM)
129
+ # curr_step_share = curr_step_share.cpu() / torch.distributed.get_world_size()
130
+ # print('step share after reduce',curr_step_share, torch.distributed.get_world_size(), force=True)
131
+
132
+ # If style is set to None, it should not be transferred to device
133
+ output = model(tuple(e.to(device) if torch.is_tensor(e) else e for e in data) if isinstance(data, tuple) else data.to(device)
134
+ , single_eval_pos=single_eval_pos)
135
+
136
+ forward_time = time.time() - before_forward
137
+
138
+ #output, targets = output[:, is_compatible], targets[:, is_compatible]
139
+
140
+ if single_eval_pos is not None:
141
+ targets = targets[single_eval_pos:]
142
+ if isinstance(criterion, nn.GaussianNLLLoss):
143
+ assert output.shape[-1] == 2, \
144
+ 'need to write a little bit of code to handle multiple regression targets at once'
145
+
146
+ mean_pred = output[..., 0]
147
+ var_pred = output[..., 1].abs()
148
+ losses = criterion(mean_pred.flatten(), targets.to(device).flatten(), var=var_pred.flatten())
149
+ elif isinstance(criterion, (nn.MSELoss, nn.BCEWithLogitsLoss)):
150
+ losses = criterion(output.flatten(), targets.to(device).flatten())
151
+ elif isinstance(criterion, (nn.CrossEntropyLoss)):
152
+ #print(n_out, targets.min(), targets.max(), force=True)
153
+ losses = criterion(output.reshape(-1, n_out), targets.to(device).long().flatten())
154
+ else:
155
+ losses = criterion(output.reshape(-1, n_out), targets.to(device).flatten())
156
+ losses = losses.view(*output.shape[0:2])
157
+ loss = losses.mean(0) @ is_compatible.float() / losses.shape[1]
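+ # Average the per-sample losses over positions, zero out incompatible batch elements via the mask, and normalize by the batch size.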
158
+ #loss = torch_nanmean(losses, axis=[0, 1]) * is_compatible.float().mean()
159
+ # not sure whether we can go without the nan checks.
160
+
161
+ loss.backward()
162
+
163
+ if ((batch % aggregate_k_gradients == aggregate_k_gradients - 1) and (not check_is_compatible or using_dist))\
164
+ or (valid_batch_steps >= aggregate_k_gradients and (check_is_compatible and not using_dist)):
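+ # Gradient accumulation step: scale the summed gradients by the number of compatible batch steps before clipping and the optimizer step.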
165
+ with torch.no_grad():
166
+ for p in model.parameters():
167
+ if p.grad is not None:
168
+ p.grad.div_(valid_batch_steps)
169
+ torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
170
+ try:
171
+ optimizer.step()
172
+ except:
173
+ print("Invalid optimization step encountered")
174
+ optimizer.zero_grad()
175
+ valid_batch_steps = 0.0
176
+
177
+ step_time = time.time() - before_forward
178
+
179
+ if not torch.isnan(loss):
180
+ total_loss += loss.item()
181
+ total_positional_losses += losses.mean(1).cpu().detach() if single_eval_pos is None else \
182
+ nn.functional.one_hot(torch.tensor(single_eval_pos), bptt)*loss.cpu().detach()
183
+
184
+ total_positional_losses_recorded += torch.ones(bptt) if single_eval_pos is None else \
185
+ nn.functional.one_hot(torch.tensor(single_eval_pos), bptt)
186
+
187
+ before_get_batch = time.time()
188
+ return total_loss / steps_per_epoch, (
189
+ total_positional_losses / total_positional_losses_recorded).tolist(), time_to_get_batch, forward_time, step_time
190
+
191
+ best_val_loss = float("inf")
192
+ best_model = None
193
+ total_loss = float('inf')
194
+ total_positional_losses = float('inf')
195
+ try:
196
+ for epoch in (range(1, epochs + 1) if epochs is not None else itertools.count(1)):
197
+
198
+ epoch_start_time = time.time()
199
+ if train_mixed_precision:
200
+ with autocast():
201
+ total_loss, total_positional_losses, time_to_get_batch, forward_time, step_time = train_step()
202
+ else:
203
+ total_loss, total_positional_losses, time_to_get_batch, forward_time, step_time = train_step()
204
+ if hasattr(dl, 'validate') and epoch % validation_period == 0:
205
+ with torch.no_grad():
206
+ val_score = dl.validate(model)
207
+ else:
208
+ val_score = None
209
+
210
+ if verbose:
211
+ print('-' * 89)
212
+ print(
213
+ f'| end of epoch {epoch:3d} | time: {(time.time() - epoch_start_time):5.2f}s | mean loss {total_loss:5.2f} | '
214
+ f"pos losses {','.join([f'{l:5.2f}' for l in total_positional_losses])}, lr {scheduler.get_last_lr()[0]}"
215
+ f' data time {time_to_get_batch:5.2f} step time {step_time:5.2f}'
216
+ f' forward time {forward_time:5.2f}' + (f'val score {val_score}' if val_score is not None else ''))
217
+ print('-' * 89)
218
+
219
+ # stepping with wallclock time based scheduler
220
+ current_time = time.time()
221
+ if epoch_callback is not None and rank == 0:
222
+ epoch_callback(model, epoch / epochs if total_available_time_in_s is None else # noqa
223
+ (current_time - start_of_training) / total_available_time_in_s # noqa
224
+ )
225
+ if epochs is None and (current_time - start_of_training) > total_available_time_in_s: # noqa
226
+ break
227
+ if epochs is None:
228
+ scheduler.step((current_time - epoch_start_time) / total_available_time_in_s * 100)
229
+ else:
230
+ scheduler.step()
231
+ except KeyboardInterrupt:
232
+ pass
233
+
234
+ return total_loss, total_positional_losses, model.to('cpu'), dl
235
+
236
+ def _parse_args(config_parser, parser):
237
+ # Do we have a config file to parse?
238
+ args_config, remaining = config_parser.parse_known_args()
239
+ if args_config.config:
240
+ with open(args_config.config, 'r') as f:
241
+ cfg = yaml.safe_load(f)
242
+ parser.set_defaults(**cfg)
243
+
244
+ # The main arg parser parses the rest of the args, the usual
245
+ # defaults will have been overridden if config file specified.
246
+ args = parser.parse_args(remaining)
247
+
248
+ # Cache the args as a text string to save them in the output dir later
249
+ args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)
250
+ return args, args_text
251
+
252
+
253
+ if __name__ == '__main__':
254
+ config_parser = argparse.ArgumentParser(description='Only used as a first parser for the config file path.')
255
+ config_parser.add_argument('--config')
256
+ parser = argparse.ArgumentParser()
257
+ parser.add_argument('prior')
258
+ parser.add_argument('--loss_function', default='barnll')
259
+ # Optional Arg's for `--loss_function barnll`
260
+ parser.add_argument('--min_y', type=float, help='barnll can only model y in strict ranges, this is the minimum y can take.')
261
+ parser.add_argument('--max_y', type=float, help='barnll can only model y in strict ranges, this is the maximum y can take.')
262
+ parser.add_argument('--num_buckets', default=100, type=int)
263
+ #parser.add_argument('--num_features', default=None, type=int, help='Specify depending on the prior.')
264
+ parser.add_argument("--extra_prior_kwargs_dict", default={'fuse_x_y': False}, dest="extra_prior_kwargs_dict", action=StoreDictKeyPair, nargs="+", metavar="KEY=VAL", help='Specify depending on the prior.')
265
+ parser.add_argument('--encoder', default='linear', type=str, help='Specify depending on the prior.')
266
+ parser.add_argument('--y_encoder', default='linear', type=str, help='Specify depending on the prior. You should specify this if you do not fuse x and y.')
267
+ parser.add_argument('--pos_encoder', default='sinus', type=str, help='Specify depending on the prior.')
268
+ parser.add_argument('--bptt', default=10, type=int)
269
+ parser.add_argument('--epochs', default=200, type=int)
270
+ parser.add_argument('--warmup_epochs', default=50, type=int)
271
+ parser.add_argument('--validation_period', default=10, type=int)
272
+ parser.add_argument('--permutation_invariant_max_eval_pos', default=None, type=int, help='Set this to an int to ')
273
+ parser.add_argument('--permutation_invariant_sampling', default='weighted', help="Only relevant if --permutation_invariant_max_eval_pos is set.")
274
+
275
+ # these can likely be mostly left at defaults
276
+ parser.add_argument('--emsize', default=512, type=int) # sometimes even larger is better e.g. 1024
277
+ parser.add_argument('--nlayers', default=6, type=int)
278
+ parser.add_argument('--nhid', default=None, type=int) # 2*emsize is the default
279
+ parser.add_argument('--nhead', default=4, type=int) # nhead = emsize / 64 in the original paper
280
+ parser.add_argument('--dropout', default=.0, type=float)
281
+ parser.add_argument('--steps_per_epoch', default=10, type=int)
282
+ parser.add_argument('--batch_size', default=1000, type=int)
283
+ parser.add_argument('--lr', '--learning_rate', default=.001, type=float) # try also .0003, .0001, go lower with lower batch size
284
+
285
+ args, _ = _parse_args(config_parser, parser)
286
+
287
+ if args.nhid is None:
288
+ args.nhid = 2*args.emsize
289
+
290
+ prior = args.__dict__.pop('prior')
291
+
292
+ if prior == 'gp':
293
+ prior = priors.fast_gp.DataLoader
294
+ elif prior == 'ridge':
295
+ prior = priors.ridge.DataLoader
296
+ elif prior == 'stroke':
297
+ prior = priors.stroke.DataLoader
298
+ elif prior == 'mix_gp':
299
+ prior = priors.fast_gp_mix.DataLoader
300
+ else:
301
+ raise NotImplementedError(f'Prior == {prior}.')
302
+
303
+ loss_function = args.__dict__.pop('loss_function')
304
+
305
+ criterion = nn.GaussianNLLLoss(reduction='none', full=True)
306
+ classificiation_criterion = nn.CrossEntropyLoss(reduction='none')
307
+ num_buckets = args.__dict__.pop('num_buckets')
308
+ max_y = args.__dict__.pop('max_y')
309
+ min_y = args.__dict__.pop('min_y')
310
+ # criterion = nn.MSELoss(reduction='none')
311
+
312
+ def get_y_sample():
313
+ dl = prior(num_steps=1, batch_size=args.batch_size * args.steps_per_epoch, seq_len=args.bptt, device=device,
314
+ **args.extra_prior_kwargs_dict)
315
+ y_sample = next(iter(dl))[-1]
316
+ print(f'Creating Bar distribution with borders from y sample of size {y_sample.numel()}')
317
+ return y_sample
318
+
319
+ if loss_function == 'ce':
320
+ criterion = nn.CrossEntropyLoss(reduction='none')
321
+ elif loss_function == 'gaussnll':
322
+ criterion = nn.GaussianNLLLoss(reduction='none', full=True)
323
+ elif loss_function == 'mse':
324
+ criterion = nn.MSELoss(reduction='none')
325
+ elif loss_function == 'barnll':
326
+ criterion = BarDistribution(borders=get_bucket_limits(num_buckets, full_range=(min_y,max_y)))
327
+ elif loss_function == 'adaptivebarnll':
328
+ borders = get_bucket_limits(num_buckets, ys=get_y_sample(), full_range=(min_y,max_y))
329
+ criterion = BarDistribution(borders=borders)
330
+ elif loss_function == 'adaptivefullsupportbarnll':
331
+ assert min_y is None and max_y is None, "Please do not specify `min_y` and `max_y` with `unboundedadaptivebarnll`."
332
+ borders = get_bucket_limits(num_buckets, ys=get_y_sample())
333
+ criterion = FullSupportBarDistribution(borders=borders)
334
+ else:
335
+ raise NotImplementedError(f'loss_function == {loss_function}.')
336
+
337
+
338
+
339
+ encoder = args.__dict__.pop('encoder')
340
+ y_encoder = args.__dict__.pop('y_encoder')
341
+
342
+ def get_encoder_generator(encoder):
343
+ if encoder == 'linear':
344
+ encoder_generator = encoders.Linear
345
+ elif encoder == 'mlp':
346
+ encoder_generator = encoders.MLP
347
+ elif encoder == 'positional':
348
+ encoder_generator = encoders.Positional
349
+ else:
350
+ raise NotImplementedError(f'A {encoder} encoder is not valid.')
351
+ return encoder_generator
352
+
353
+ encoder_generator = get_encoder_generator(encoder)
354
+ y_encoder_generator = get_encoder_generator(y_encoder)
355
+
356
+ pos_encoder = args.__dict__.pop('pos_encoder')
357
+
358
+ if pos_encoder == 'none':
359
+ pos_encoder_generator = None
360
+ elif pos_encoder == 'sinus':
361
+ pos_encoder_generator = positional_encodings.PositionalEncoding
362
+ elif pos_encoder == 'learned':
363
+ pos_encoder_generator = positional_encodings.LearnedPositionalEncoding
364
+ elif pos_encoder == 'paired_scrambled_learned':
365
+ pos_encoder_generator = positional_encodings.PairedScrambledPositionalEncodings
366
+ else:
367
+ raise NotImplementedError(f'pos_encoer == {pos_encoder} is not valid.')
368
+
369
+ permutation_invariant_max_eval_pos = args.__dict__.pop('permutation_invariant_max_eval_pos')
370
+ permutation_invariant_sampling = args.__dict__.pop('permutation_invariant_sampling')
371
+ if permutation_invariant_max_eval_pos is not None:
372
+ if permutation_invariant_sampling == 'weighted':
373
+ get_sampler = get_weighted_single_eval_pos_sampler
374
+ elif permutation_invariant_sampling == 'uniform':
375
+ get_sampler = get_uniform_single_eval_pos_sampler
376
+ else:
377
+ raise ValueError()
378
+ args.__dict__['single_eval_pos_gen'] = get_sampler(permutation_invariant_max_eval_pos)
379
+
380
+
381
+ print("ARGS for `train`:", args.__dict__)
382
+
383
+ train(prior, criterion, encoder_generator,
384
+ y_encoder_generator=y_encoder_generator, pos_encoder_generator=pos_encoder_generator,
385
+ **args.__dict__)
386
+
TabPFN/transformer.py ADDED
@@ -0,0 +1,226 @@
1
+ import math
2
+ from typing import Optional
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch import Tensor
7
+ from torch.nn import Module, TransformerEncoder
8
+
9
+ from layer import TransformerEncoderLayer, _get_activation_fn
10
+ from utils import SeqBN, bool_mask_to_att_mask
11
+
12
+
13
+
14
+ class TransformerModel(nn.Module):
15
+ def __init__(self, encoder, n_out, ninp, nhead, nhid, nlayers, dropout=0.0, style_encoder=None, y_encoder=None,
16
+ pos_encoder=None, decoder=None, input_normalization=False, init_method=None, pre_norm=False,
17
+ activation='gelu', recompute_attn=False, num_global_att_tokens=0, full_attention=False,
18
+ all_layers_same_init=True):
19
+ super().__init__()
20
+ self.model_type = 'Transformer'
21
+ encoder_layer_creator = lambda: TransformerEncoderLayer(ninp, nhead, nhid, dropout, activation=activation,
22
+ pre_norm=pre_norm, recompute_attn=recompute_attn)
23
+ self.transformer_encoder = TransformerEncoder(encoder_layer_creator(), nlayers)\
24
+ if all_layers_same_init else TransformerEncoderDiffInit(encoder_layer_creator, nlayers)
25
+ self.ninp = ninp
26
+ self.encoder = encoder
27
+ self.y_encoder = y_encoder
28
+ self.pos_encoder = pos_encoder
29
+ self.decoder = decoder(ninp, nhid, n_out) if decoder is not None else nn.Sequential(nn.Linear(ninp, nhid), nn.GELU(), nn.Linear(nhid, n_out))
30
+ self.input_ln = SeqBN(ninp) if input_normalization else None
31
+ self.style_encoder = style_encoder
32
+ self.init_method = init_method
33
+ if num_global_att_tokens is not None:
34
+ assert not full_attention
35
+ self.global_att_embeddings = nn.Embedding(num_global_att_tokens, ninp) if num_global_att_tokens else None
36
+ self.full_attention = full_attention
37
+
38
+ self.n_out = n_out
39
+ self.nhid = nhid
40
+
41
+ self.init_weights()
42
+
43
+ @staticmethod
44
+ def generate_square_subsequent_mask(sz):
45
+ mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
46
+ return bool_mask_to_att_mask(mask)
47
+
48
+ @staticmethod
49
+ def generate_D_q_matrix(sz, query_size):
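+ # Attention mask: every position may attend to the first (sz - query_size) training positions; query positions additionally attend only to themselves.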
50
+ train_size = sz-query_size
51
+ mask = torch.zeros(sz,sz) == 0
52
+ mask[:,train_size:].zero_()
53
+ mask |= torch.eye(sz) == 1
54
+ return bool_mask_to_att_mask(mask)
55
+
56
+ @staticmethod
57
+ def generate_global_att_query_matrix(num_global_att_tokens, seq_len, num_query_tokens):
58
+ train_size = seq_len + num_global_att_tokens - num_query_tokens
59
+ sz = seq_len + num_global_att_tokens
60
+ mask = torch.zeros(num_query_tokens, sz) == 0
61
+ mask[:,train_size:].zero_()
62
+ mask[:,train_size:] |= torch.eye(num_query_tokens) == 1
63
+ return bool_mask_to_att_mask(mask)
64
+
65
+ @staticmethod
66
+ def generate_global_att_trainset_matrix(num_global_att_tokens, seq_len, num_query_tokens):
67
+ train_size = seq_len + num_global_att_tokens - num_query_tokens
68
+ trainset_size = seq_len - num_query_tokens
69
+ mask = torch.zeros(trainset_size, num_global_att_tokens) == 0
70
+ #mask[:,num_global_att_tokens:].zero_()
71
+ #mask[:,num_global_att_tokens:] |= torch.eye(trainset_size) == 1
72
+ return bool_mask_to_att_mask(mask)
73
+
74
+ @staticmethod
75
+ def generate_global_att_globaltokens_matrix(num_global_att_tokens, seq_len, num_query_tokens):
76
+ mask = torch.zeros(num_global_att_tokens, num_global_att_tokens+seq_len-num_query_tokens) == 0
77
+ return bool_mask_to_att_mask(mask)
78
+
79
+ def init_weights(self):
80
+ initrange = 1.
81
+ # if isinstance(self.encoder,EmbeddingEncoder):
82
+ # self.encoder.weight.data.uniform_(-initrange, initrange)
83
+ # self.decoder.bias.data.zero_()
84
+ # self.decoder.weight.data.uniform_(-initrange, initrange)
85
+ if self.init_method is not None:
86
+ self.apply(self.init_method)
87
+ for layer in self.transformer_encoder.layers:
88
+ nn.init.zeros_(layer.linear2.weight)
89
+ nn.init.zeros_(layer.linear2.bias)
90
+ attns = layer.self_attn if isinstance(layer.self_attn, nn.ModuleList) else [layer.self_attn]
91
+ for attn in attns:
92
+ nn.init.zeros_(attn.out_proj.weight)
93
+ nn.init.zeros_(attn.out_proj.bias)
94
+
95
+ def forward(self, src, src_mask=None, single_eval_pos=None):
96
+ assert isinstance(src, tuple), 'fuse_x_y is forbidden, that is inputs have to be given as (x,y) or (style,x,y)'
97
+
98
+ if len(src) == 2:
99
+ src = (None,) + src
100
+
101
+ style_src, style_src_size = (src[0], (0 if (src[0] is None) else 1))
102
+ if src_mask is not None: assert self.global_att_embeddings is None or isinstance(src_mask, tuple)
103
+ if src_mask is None:
104
+ x_src = src[1]
105
+ if self.global_att_embeddings is None:
106
+ full_len = len(x_src) + style_src_size
107
+ if self.full_attention:
108
+ src_mask = bool_mask_to_att_mask(torch.ones((full_len, full_len), dtype=torch.bool)).to(x_src.device)
109
+ else:
110
+ src_mask = self.generate_D_q_matrix(len(x_src) + style_src_size, len(x_src) + style_src_size -single_eval_pos).to(x_src.device)
111
+ else:
112
+ src_mask_args = (self.global_att_embeddings.num_embeddings,
113
+ len(x_src) + style_src_size,
114
+ len(x_src) + style_src_size - single_eval_pos)
115
+ src_mask = (self.generate_global_att_globaltokens_matrix(*src_mask_args).to(x_src.device),
116
+ self.generate_global_att_trainset_matrix(*src_mask_args).to(x_src.device),
117
+ self.generate_global_att_query_matrix(*src_mask_args).to(x_src.device))
118
+
119
+ style_src, x_src, y_src = src
120
+ x_src = self.encoder(x_src)
121
+ y_src = self.y_encoder(y_src.unsqueeze(-1) if len(y_src.shape) < len(x_src.shape) else y_src)
122
+ style_src = self.style_encoder(style_src).unsqueeze(0) if self.style_encoder else torch.tensor([], device=x_src.device)
123
+ global_src = torch.tensor([], device=x_src.device) if self.global_att_embeddings is None else \
124
+ self.global_att_embeddings.weight.unsqueeze(1).repeat(1, x_src.shape[1], 1)
125
+ train_x = x_src[:single_eval_pos] + y_src[:single_eval_pos]
126
+ src = torch.cat([global_src, style_src, train_x, x_src[single_eval_pos:]], 0)
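+ # Training positions carry the sum of x and y embeddings; query positions carry only x; global and style tokens are prepended.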
127
+
128
+ if self.input_ln is not None:
129
+ src = self.input_ln(src)
130
+
131
+ if self.pos_encoder is not None:
132
+ src = self.pos_encoder(src)
133
+
134
+ # If we have style input, drop its output
135
+ output = self.transformer_encoder(src, src_mask)[style_src_size:]
136
+ output = self.decoder(output)
137
+ return output[single_eval_pos+(self.global_att_embeddings.num_embeddings if self.global_att_embeddings else 0):]
138
+
139
+ @torch.no_grad()
140
+ def init_from_small_model(self, small_model):
141
+ assert isinstance(self.decoder, nn.Linear) and isinstance(self.encoder, (nn.Linear, nn.Sequential)) \
142
+ and isinstance(self.y_encoder, (nn.Linear, nn.Sequential))
143
+
144
+ def set_encoder_weights(my_encoder, small_model_encoder):
145
+ my_encoder_linear, small_encoder_linear = (my_encoder, small_model_encoder) \
146
+ if isinstance(my_encoder, nn.Linear) else (my_encoder[-1], small_model_encoder[-1])
147
+ small_in_dim = small_encoder_linear.out_features
148
+ my_encoder_linear.weight.zero_()
149
+ my_encoder_linear.bias.zero_()
150
+ my_encoder_linear.weight[:small_in_dim] = small_encoder_linear.weight
151
+ my_encoder_linear.bias[:small_in_dim] = small_encoder_linear.bias
152
+
153
+ set_encoder_weights(self.encoder, small_model.encoder)
154
+ set_encoder_weights(self.y_encoder, small_model.y_encoder)
155
+
156
+ small_in_dim = small_model.decoder.in_features
157
+
158
+ self.decoder.weight[:, :small_in_dim] = small_model.decoder.weight
159
+ self.decoder.bias = small_model.decoder.bias
160
+
161
+ for my_layer, small_layer in zip(self.transformer_encoder.layers, small_model.transformer_encoder.layers):
162
+ small_hid_dim = small_layer.linear1.out_features
163
+ my_in_dim = my_layer.linear1.in_features
164
+
165
+ # packed along q,k,v order in first dim
166
+ my_in_proj_w = my_layer.self_attn.in_proj_weight
167
+ small_in_proj_w = small_layer.self_attn.in_proj_weight
168
+
169
+ my_in_proj_w.view(3, my_in_dim, my_in_dim)[:, :small_in_dim, :small_in_dim] = small_in_proj_w.view(3,
170
+ small_in_dim,
171
+ small_in_dim)
172
+ my_layer.self_attn.in_proj_bias.view(3, my_in_dim)[:,
173
+ :small_in_dim] = small_layer.self_attn.in_proj_bias.view(3, small_in_dim)
174
+
175
+ my_layer.self_attn.out_proj.weight[:small_in_dim, :small_in_dim] = small_layer.self_attn.out_proj.weight
176
+ my_layer.self_attn.out_proj.bias[:small_in_dim] = small_layer.self_attn.out_proj.bias
177
+
178
+ my_layer.linear1.weight[:small_hid_dim, :small_in_dim] = small_layer.linear1.weight
179
+ my_layer.linear1.bias[:small_hid_dim] = small_layer.linear1.bias
180
+
181
+ my_layer.linear2.weight[:small_in_dim, :small_hid_dim] = small_layer.linear2.weight
182
+ my_layer.linear2.bias[:small_in_dim] = small_layer.linear2.bias
183
+
184
+ my_layer.norm1.weight[:small_in_dim] = math.sqrt(small_in_dim / my_in_dim) * small_layer.norm1.weight
185
+ my_layer.norm2.weight[:small_in_dim] = math.sqrt(small_in_dim / my_in_dim) * small_layer.norm2.weight
186
+
187
+ my_layer.norm1.bias[:small_in_dim] = small_layer.norm1.bias
188
+ my_layer.norm2.bias[:small_in_dim] = small_layer.norm2.bias
189
+
190
+
191
+ class TransformerEncoderDiffInit(Module):
192
+ r"""TransformerEncoder is a stack of N encoder layers
193
+
194
+ Args:
195
+ encoder_layer_creator: a function generating objects of TransformerEncoderLayer class without args (required).
196
+ num_layers: the number of sub-encoder-layers in the encoder (required).
197
+ norm: the layer normalization component (optional).
198
+ """
199
+ __constants__ = ['norm']
200
+
201
+ def __init__(self, encoder_layer_creator, num_layers, norm=None):
202
+ super().__init__()
203
+ self.layers = nn.ModuleList([encoder_layer_creator() for _ in range(num_layers)])
204
+ self.num_layers = num_layers
205
+ self.norm = norm
206
+
207
+ def forward(self, src: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
208
+ r"""Pass the input through the encoder layers in turn.
209
+
210
+ Args:
211
+ src: the sequence to the encoder (required).
212
+ mask: the mask for the src sequence (optional).
213
+ src_key_padding_mask: the mask for the src keys per batch (optional).
214
+
215
+ Shape:
216
+ see the docs in Transformer class.
217
+ """
218
+ output = src
219
+
220
+ for mod in self.layers:
221
+ output = mod(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask)
222
+
223
+ if self.norm is not None:
224
+ output = self.norm(output)
225
+
226
+ return output
TabPFN/utils.py ADDED
@@ -0,0 +1,236 @@
1
+ import os
2
+ import math
3
+ import argparse
4
+ import random
5
+ import datetime
6
+
7
+ import torch
8
+ from torch import nn
9
+ from torch.optim.lr_scheduler import LambdaLR
10
+ import numpy as np
11
+
12
+ # copied from huggingface
13
+ def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
14
+ """ Create a schedule with a learning rate that decreases following the
15
+ values of the cosine function between 0 and `pi * cycles` after a warmup
16
+ period during which it increases linearly between 0 and 1.
17
+ """
18
+
19
+ def lr_lambda(current_step):
20
+ if current_step < num_warmup_steps:
21
+ return float(current_step) / float(max(1, num_warmup_steps))
22
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
23
+ return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
24
+
25
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
26
+
27
+ # copied from huggingface
28
+ def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
29
+ """
30
+ Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
31
+ a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
32
+
33
+ Args:
34
+ optimizer (:class:`~torch.optim.Optimizer`):
35
+ The optimizer for which to schedule the learning rate.
36
+ num_warmup_steps (:obj:`int`):
37
+ The number of steps for the warmup phase.
38
+ num_training_steps (:obj:`int`):
39
+ The total number of training steps.
40
+ last_epoch (:obj:`int`, `optional`, defaults to -1):
41
+ The index of the last epoch when resuming training.
42
+
43
+ Return:
44
+ :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
45
+ """
46
+
47
+ def lr_lambda(current_step: int):
48
+ if current_step < num_warmup_steps:
49
+ return float(current_step) / float(max(1, num_warmup_steps))
50
+ return max(
51
+ 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
52
+ )
53
+
54
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
55
+
56
+
57
+ def get_openai_lr(transformer_model):
58
+ num_params = sum(p.numel() for p in transformer_model.parameters())
59
+ return 0.003239 - 0.0001395 * math.log(num_params)
60
+
61
+
62
+ def get_weighted_single_eval_pos_sampler(max_len):
63
+ """
64
+ This gives a sampler that can be used for `single_eval_pos` which yields good performance for all positions p,
65
+ where p <= `max_len`. At most `max_len` - 1 examples are shown to the Transformer.
66
+ :return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
67
+ """
68
+ return lambda: random.choices(range(max_len), [1 / (max_len - i) for i in range(max_len)])[0]
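+ # Weight 1/(max_len - i) makes larger eval positions proportionally more likely to be sampled.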
69
+
70
+
71
+ def get_uniform_single_eval_pos_sampler(max_len, min_len=0):
72
+ """
73
+ Just sample any evaluation position with the same weight
74
+ :return: Sampler that can be fed to `train()` as `single_eval_pos_gen`.
75
+ """
76
+ return lambda: random.choices(range(min_len, max_len))[0]
77
+
78
+
79
+ class SeqBN(nn.Module):
80
+ def __init__(self, d_model):
81
+ super().__init__()
82
+ self.bn = nn.BatchNorm1d(d_model)
83
+ self.d_model = d_model
84
+
85
+ def forward(self, x):
86
+ assert self.d_model == x.shape[-1]
87
+ flat_x = x.view(-1, self.d_model)
88
+ flat_x = self.bn(flat_x)
89
+ return flat_x.view(*x.shape)
90
+
91
+
92
+ def set_locals_in_self(locals):
93
+ self = locals['self']
94
+ for var_name, val in locals.items():
95
+ if var_name != 'self': setattr(self, var_name, val)
96
+
97
+
98
+ default_device = 'cuda:0' if torch.cuda.is_available() else 'cpu:0'
99
+
100
+
101
+ # Copied from StackOverflow, but we do an eval on the values additionally
102
+ class StoreDictKeyPair(argparse.Action):
103
+ def __init__(self, option_strings, dest, nargs=None, **kwargs):
104
+ self._nargs = nargs
105
+ super(StoreDictKeyPair, self).__init__(option_strings, dest, nargs=nargs, **kwargs)
106
+
107
+ def __call__(self, parser, namespace, values, option_string=None):
108
+ my_dict = {}
109
+ for kv in values:
110
+ k, v = kv.split("=")
111
+ try:
112
+ my_dict[k] = eval(v)
113
+ except NameError:
114
+ my_dict[k] = v
115
+ setattr(namespace, self.dest, my_dict)
116
+ print("dict values: {}".format(my_dict))
117
+
118
+ def get_nan_value(v, set_value_to_nan=0.0):
119
+ if random.random() < set_value_to_nan:
120
+ return v
121
+ else:
122
+ return random.choice([-999, 0, 1, 999])
123
+
124
+ def to_ranking(data):
125
+ x = (data >= data.unsqueeze(-3))
126
+ x = x.sum(0)
127
+ return x
128
+ # TODO: Is there a better way to do this?
129
+ # 1. Comparing to unique elements: When all values are different we still get quadratic blowup
130
+ # 2. Argsort(Argsort()) returns ranking, but with duplicate values there is an ordering which is problematic
131
+ # 3. Argsort(Argsort(Unique))->Scatter seems a bit complicated, doesn't have quadratic blowup, but how fast?
132
+ def to_ranking_low_mem(data):
133
+ x = torch.zeros_like(data)
134
+ for col in range(data.shape[-1]):
135
+ x_ = (data[:, :, col] >= data[:, :, col].unsqueeze(-2))
136
+ x_ = x_.sum(0)
137
+ x[:, :, col] = x_
138
+ return x
139
+
140
+ def nan_handling_missing_for_unknown_reason_value(set_value_to_nan=0.0):
141
+ return get_nan_value(float('nan'), set_value_to_nan)
142
+
143
+ def nan_handling_missing_for_no_reason_value(set_value_to_nan=0.0):
144
+ return get_nan_value(float('-inf'), set_value_to_nan)
145
+
146
+ def nan_handling_missing_for_a_reason_value(set_value_to_nan=0.0):
147
+ return get_nan_value(float('inf'), set_value_to_nan)
148
+
149
+ def torch_nanmean(x, axis=0):
150
+ num = torch.where(torch.isnan(x), torch.full_like(x, 0), torch.full_like(x, 1)).sum(axis=axis)
151
+ value = torch.where(torch.isnan(x), torch.full_like(x, 0), x).sum(axis=axis)
152
+ return value / num
153
+
154
+ def torch_nanstd(x, axis=0):
155
+ num = torch.where(torch.isnan(x), torch.full_like(x, 0), torch.full_like(x, 1)).sum(axis=axis)
156
+ value = torch.where(torch.isnan(x), torch.full_like(x, 0), x).sum(axis=axis)
157
+ mean = value / num
158
+ mean_broadcast = torch.repeat_interleave(mean.unsqueeze(axis), x.shape[axis], dim=axis)
159
+ return torch.sqrt(torch.nansum(torch.square(mean_broadcast - x), axis=axis) / (num - 1))
160
+
161
+ def normalize_data(data, normalize_positions=-1):
162
+ if normalize_positions > 0:
163
+ mean = torch_nanmean(data[:normalize_positions], axis=0)
164
+ std = torch_nanstd(data[:normalize_positions], axis=0) + .000001
165
+ else:
166
+ mean = torch_nanmean(data, axis=0)
167
+ std = torch_nanstd(data, axis=0) + .000001
168
+ data = (data - mean) / std
169
+ data = torch.clip(data, min=-100, max=100)
170
+
171
+ return data
172
+
173
+ def remove_outliers(X, n_sigma=4):
174
+ # Expects T, B, H
175
+ assert len(X.shape) == 3, "X must be T,B,H"
176
+ #for b in range(X.shape[1]):
177
+ #for col in range(X.shape[2]):
178
+ data = X
179
+ data_mean, data_std = torch_nanmean(data, axis=0), torch_nanstd(data, axis=0)
180
+ cut_off = data_std * n_sigma
181
+ lower, upper = data_mean - cut_off, data_mean + cut_off
182
+
183
+ data_clean = X[:].clone()
184
+ data_clean[torch.logical_or(data > upper, data < lower)] = np.nan
185
+ data_mean, data_std = torch_nanmean(data_clean, axis=0), torch_nanstd(data_clean, axis=0)
186
+ cut_off = data_std * n_sigma
187
+ lower, upper = data_mean - cut_off, data_mean + cut_off
188
+
189
+ X = torch.maximum(-torch.log(1+torch.abs(X)) + lower, X)
190
+ X = torch.minimum(torch.log(1+torch.abs(X)) + upper, X)
191
+ # print(ds[1][data < lower, col], ds[1][data > upper, col], ds[1][~np.isnan(data), col].shape, data_mean, data_std)
192
+ return X
193
+
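A quick usage sketch for remove_outliers: the input must be a 3-dimensional (seq, batch, features) tensor, and values beyond n_sigma standard deviations appear to be compressed logarithmically past the bounds rather than hard-clipped.

```python
import torch

x = torch.randn(200, 8, 4)             # (seq, batch, features)
x[0, 0, 0] = 1000.0                    # inject an extreme value
x_clean = remove_outliers(x, n_sigma=4)
print(x[0, 0, 0].item(), x_clean[0, 0, 0].item())   # the outlier is strongly damped
```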
194
+ def bool_mask_to_att_mask(mask):
195
+ return mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
196
+
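And a tiny illustration of bool_mask_to_att_mask, which turns a boolean "may attend" mask into the additive float mask used by PyTorch-style attention (0 where attention is allowed, -inf where it is blocked); the 3x3 causal mask is just an example.

```python
import torch

allowed = torch.tril(torch.ones(3, 3, dtype=torch.bool))   # causal: attend to self and past
att_mask = bool_mask_to_att_mask(allowed)
# tensor([[0., -inf, -inf],
#         [0.,   0., -inf],
#         [0.,   0.,   0.]])
```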
197
+ def print_on_master_only(is_master):
198
+ import builtins as __builtin__
199
+
200
+ builtin_print = __builtin__.print
201
+
202
+ def print(*args, **kwargs):
203
+ force = kwargs.pop("force", False)
204
+ if is_master or force:
205
+ builtin_print(*args, **kwargs)
206
+
207
+ __builtin__.print = print
208
+
209
+ def init_dist(device):
210
+ if 'SLURM_PROCID' in os.environ and torch.cuda.device_count() > 1:
211
+ assert device != 'cpu:0'
212
+ rank = int(os.environ['SLURM_PROCID'])
213
+ os.environ['MASTER_ADDR'] = 'localhost'
214
+ os.environ['MASTER_PORT'] = '12355'
215
+ torch.cuda.set_device(rank)
216
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(rank)
217
+ torch.distributed.init_process_group(backend="nccl", init_method="env://", timeout=datetime.timedelta(seconds=20),
218
+ world_size=torch.cuda.device_count(), rank=rank)
219
+ torch.distributed.barrier()
220
+ print_on_master_only(rank == 0)
221
+ print(f"Distributed training on {torch.cuda.device_count()} GPUs, this is rank {rank}, "
222
+ "only I can print, but when using print(..., force=True) it will print on all ranks.")
223
+
224
+ return True, rank, f'cuda:{rank}'
225
+ else:
226
+ print('Not using distributed')
227
+ # will not change any of the behavior of print, but allows putting the force=True in the print calls
228
+ print_on_master_only(True)
229
+ return False, 0, device
230
+
231
+ # No-op context manager for python with statements (x = NOP(); with x: ...)
232
+ class NOP():
233
+ def __enter__(self):
234
+ pass
235
+ def __exit__(self, type, value, traceback):
236
+ pass
app.py ADDED
@@ -0,0 +1,96 @@
1
+ import sys
2
+ tabpfn_path = 'TabPFN'
3
+ sys.path.insert(0, tabpfn_path) # our submodule of the TabPFN repo (at 045c8400203ebd062346970b4f2c0ccda5a40618)
4
+ from TabPFN.scripts.transformer_prediction_interface import TabPFNClassifier
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import torch
9
+ import gradio as gr
10
+ import openml
11
+
12
+
13
+ def compute(table: np.array):
14
+ vfunc = np.vectorize(lambda s: len(s))
15
+ non_empty_row_mask = (vfunc(table).sum(1) != 0)
16
+ table = table[non_empty_row_mask]
17
+ empty_mask = table == ''
18
+ empty_inds = np.where(empty_mask)
19
+ if not len(empty_inds[0]):
20
+ return "**Please leave at least one field blank for prediction.**", None
21
+ if not np.all(empty_inds[1][0] == empty_inds[1]):
22
+ return "**Please only leave fields of one column blank for prediction.**", None
23
+ y_column = empty_inds[1][0]
24
+ eval_lines = empty_inds[0]
25
+
26
+ train_table = np.delete(table, eval_lines, axis=0)
27
+ eval_table = table[eval_lines]
28
+
29
+ try:
30
+ x_train = torch.tensor(np.delete(train_table, y_column, axis=1).astype(np.float32))
31
+ x_eval = torch.tensor(np.delete(eval_table, y_column, axis=1).astype(np.float32))
32
+
33
+ y_train = train_table[:, y_column]
34
+ except ValueError:
35
+ return "**Please only add numbers (to the inputs) or leave fields empty.**", None
36
+
37
+ classifier = TabPFNClassifier(base_path=tabpfn_path, device='cpu')
38
+ classifier.fit(x_train, y_train)
39
+ y_eval, p_eval = classifier.predict(x_eval, return_winning_probability=True)
40
+
41
+ # print(file, type(file))
42
+ out_table = table.copy().astype(str)
43
+ out_table[eval_lines, y_column] = [f"{y_e} (p={p_e:.2f})" for y_e, p_e in zip(y_eval, p_eval)]
44
+ return None, out_table
45
+
46
+
47
+ def upload_file(file):
48
+ if file.name.endswith('.arff'):
49
+ dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=file.name)
50
+ X_, _, categorical_indicator_, attribute_names_ = dataset.get_data(
51
+ dataset_format="array"
52
+ )
53
+ df = pd.DataFrame(X_, columns=attribute_names_)
54
+ return df
55
+ elif file.name.endswith('.csv') or file.name.endswith('.data'):
56
+ df = pd.read_csv(file.name, header=None)
57
+ df.columns = np.arange(len(df.columns))
58
+ print(df)
59
+ return df
60
+
61
+
62
+ example = \
63
+ [
64
+ [1, 2, 1],
65
+ [2, 1, 1],
66
+ [1, 1, 1],
67
+ [2, 2, 2],
68
+ [3, 4, 2],
69
+ [3, 2, 2],
70
+ [2, 3, '']
71
+ ]
72
+
73
+ with gr.Blocks() as demo:
74
+ gr.Markdown("""This demo allows you to play with the **TabPFN**.
75
+ You can either edit the table manually (we have filled it with a toy task: rows whose features sum to at most 3 get label 1, larger sums get label 2) or upload a dataset below.
76
+ The network predicts the fields you leave empty. Only one column may contain empty cells to be predicted.
77
+ Please provide all columns except the label column as numeric values. It is fine to encode classes as integers.
78
+ """)
79
+ inp_table = gr.DataFrame(type='numpy', value=example, headers=[''] * 3)
80
+ inp_file = gr.File(
81
+ label='Drop a .csv file (no header, numeric values for all columns except the labels) or an .arff file.')
82
+ examples = gr.Examples(examples=['iris.csv', 'balance-scale.arff'],
83
+ inputs=[inp_file],
84
+ outputs=[inp_table],
85
+ fn=upload_file,
86
+ cache_examples=True)
87
+ btn = gr.Button("Predict Empty Table Cells")
88
+
89
+ inp_file.change(fn=upload_file, inputs=inp_file, outputs=inp_table)
90
+
91
+ out_text = gr.Markdown()
92
+ out_table = gr.DataFrame()
93
+
94
+ btn.click(fn=compute, inputs=inp_table, outputs=[out_text, out_table])
95
+
96
+ demo.launch()
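For reference, the core of compute() reduces to the TabPFNClassifier calls below. This is a minimal sketch against the toy table from the demo (importing app.py directly would also launch the Gradio server, so the relevant lines are repeated here), and the exact prediction and probability depend on the checkpoint.

```python
import sys
sys.path.insert(0, 'TabPFN')            # as in app.py, so the submodule is importable
import numpy as np
import torch
from TabPFN.scripts.transformer_prediction_interface import TabPFNClassifier

# toy task from the demo: features summing to at most 3 -> label 1, otherwise label 2
x_train = torch.tensor([[1, 2], [2, 1], [1, 1], [2, 2], [3, 4], [3, 2]], dtype=torch.float32)
y_train = np.array([1, 1, 1, 2, 2, 2])
x_eval = torch.tensor([[2, 3]], dtype=torch.float32)

classifier = TabPFNClassifier(base_path='TabPFN', device='cpu')
classifier.fit(x_train, y_train)
y_eval, p_eval = classifier.predict(x_eval, return_winning_probability=True)
print(y_eval, p_eval)                    # e.g. [2] [0.9...]; values depend on the model
```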
balance-scale.arff ADDED
@@ -0,0 +1,694 @@
1
+ %1. Title: Balance Scale Weight & Distance Database
2
+ %
3
+ %2. Source Information:
4
+ % (a) Source: Generated to model psychological experiments reported
5
+ % by Siegler, R. S. (1976). Three Aspects of Cognitive
6
+ % Development. Cognitive Psychology, 8, 481-520.
7
+ % (b) Donor: Tim Hume (hume@ics.uci.edu)
8
+ % (c) Date: 22 April 1994
9
+ %
10
+ %3. Past Usage: (possibly different formats of this data)
11
+ % - Publications
12
+ % 1. Klahr, D., & Siegler, R.S. (1978). The Representation of
13
+ % Children's Knowledge. In H. W. Reese & L. P. Lipsitt (Eds.),
14
+ % Advances in Child Development and Behavior, pp. 61-116. New
15
+ % York: Academic Press
16
+ % 2. Langley,P. (1987). A General Theory of Discrimination
17
+ % Learning. In D. Klahr, P. Langley, & R. Neches (Eds.),
18
+ % Production System Models of Learning and Development, pp.
19
+ % 99-161. Cambridge, MA: MIT Press
20
+ % 3. Newell, A. (1990). Unified Theories of Cognition.
21
+ % Cambridge, MA: Harvard University Press
22
+ % 4. McClelland, J.L. (1988). Parallel Distributed Processing:
23
+ % Implications for Cognition and Development. Technical
24
+ % Report AIP-47, Department of Psychology, Carnegie-Mellon
25
+ % University
26
+ % 5. Shultz, T., Mareschal, D., & Schmidt, W. (1994). Modeling
27
+ % Cognitive Development on Balance Scale Phenomena. Machine
28
+ % Learning, Vol. 16, pp. 59-88.
29
+ %
30
+ %4. Relevant Information:
31
+ % This data set was generated to model psychological
32
+ % experimental results. Each example is classified as having the
33
+ % balance scale tip to the right, tip to the left, or be
34
+ % balanced. The attributes are the left weight, the left
35
+ % distance, the right weight, and the right distance. The
36
+ % correct way to find the class is the greater of
37
+ % (left-distance * left-weight) and (right-distance *
38
+ % right-weight). If they are equal, it is balanced.
39
+ %
40
+ %5. Number of Instances: 625 (49 balanced, 288 left, 288 right)
41
+ %
42
+ %6. Number of Attributes: 4 (numeric) + class name = 5
43
+ %
44
+ %7. Attribute Information:
45
+ % 1. Class Name: 3 (L, B, R)
46
+ % 2. Left-Weight: 5 (1, 2, 3, 4, 5)
47
+ % 3. Left-Distance: 5 (1, 2, 3, 4, 5)
48
+ % 4. Right-Weight: 5 (1, 2, 3, 4, 5)
49
+ % 5. Right-Distance: 5 (1, 2, 3, 4, 5)
50
+ %
51
+ %8. Missing Attribute Values:
52
+ % none
53
+ %
54
+ %9. Class Distribution:
55
+ % 1. 46.08 percent are L
56
+ % 2. 07.84 percent are B
57
+ % 3. 46.08 percent are R
58
+ %
59
+
60
+ @relation balance-scale
61
+ @attribute 'left-weight' real
62
+ @attribute 'left-distance' real
63
+ @attribute 'right-weight' real
64
+ @attribute 'right-distance' real
65
+ @attribute 'class' { L, B, R}
66
+ @data
67
+ 1,1,1,1,B
68
+ 1,1,1,2,R
69
+ 1,1,1,3,R
70
+ 1,1,1,4,R
71
+ 1,1,1,5,R
72
+ 1,1,2,1,R
73
+ 1,1,2,2,R
74
+ 1,1,2,3,R
75
+ 1,1,2,4,R
76
+ 1,1,2,5,R
77
+ 1,1,3,1,R
78
+ 1,1,3,2,R
79
+ 1,1,3,3,R
80
+ 1,1,3,4,R
81
+ 1,1,3,5,R
82
+ 1,1,4,1,R
83
+ 1,1,4,2,R
84
+ 1,1,4,3,R
85
+ 1,1,4,4,R
86
+ 1,1,4,5,R
87
+ 1,1,5,1,R
88
+ 1,1,5,2,R
89
+ 1,1,5,3,R
90
+ 1,1,5,4,R
91
+ 1,1,5,5,R
92
+ 1,2,1,1,L
93
+ 1,2,1,2,B
94
+ 1,2,1,3,R
95
+ 1,2,1,4,R
96
+ 1,2,1,5,R
97
+ 1,2,2,1,B
98
+ 1,2,2,2,R
99
+ 1,2,2,3,R
100
+ 1,2,2,4,R
101
+ 1,2,2,5,R
102
+ 1,2,3,1,R
103
+ 1,2,3,2,R
104
+ 1,2,3,3,R
105
+ 1,2,3,4,R
106
+ 1,2,3,5,R
107
+ 1,2,4,1,R
108
+ 1,2,4,2,R
109
+ 1,2,4,3,R
110
+ 1,2,4,4,R
111
+ 1,2,4,5,R
112
+ 1,2,5,1,R
113
+ 1,2,5,2,R
114
+ 1,2,5,3,R
115
+ 1,2,5,4,R
116
+ 1,2,5,5,R
117
+ 1,3,1,1,L
118
+ 1,3,1,2,L
119
+ 1,3,1,3,B
120
+ 1,3,1,4,R
121
+ 1,3,1,5,R
122
+ 1,3,2,1,L
123
+ 1,3,2,2,R
124
+ 1,3,2,3,R
125
+ 1,3,2,4,R
126
+ 1,3,2,5,R
127
+ 1,3,3,1,B
128
+ 1,3,3,2,R
129
+ 1,3,3,3,R
130
+ 1,3,3,4,R
131
+ 1,3,3,5,R
132
+ 1,3,4,1,R
133
+ 1,3,4,2,R
134
+ 1,3,4,3,R
135
+ 1,3,4,4,R
136
+ 1,3,4,5,R
137
+ 1,3,5,1,R
138
+ 1,3,5,2,R
139
+ 1,3,5,3,R
140
+ 1,3,5,4,R
141
+ 1,3,5,5,R
142
+ 1,4,1,1,L
143
+ 1,4,1,2,L
144
+ 1,4,1,3,L
145
+ 1,4,1,4,B
146
+ 1,4,1,5,R
147
+ 1,4,2,1,L
148
+ 1,4,2,2,B
149
+ 1,4,2,3,R
150
+ 1,4,2,4,R
151
+ 1,4,2,5,R
152
+ 1,4,3,1,L
153
+ 1,4,3,2,R
154
+ 1,4,3,3,R
155
+ 1,4,3,4,R
156
+ 1,4,3,5,R
157
+ 1,4,4,1,B
158
+ 1,4,4,2,R
159
+ 1,4,4,3,R
160
+ 1,4,4,4,R
161
+ 1,4,4,5,R
162
+ 1,4,5,1,R
163
+ 1,4,5,2,R
164
+ 1,4,5,3,R
165
+ 1,4,5,4,R
166
+ 1,4,5,5,R
167
+ 1,5,1,1,L
168
+ 1,5,1,2,L
169
+ 1,5,1,3,L
170
+ 1,5,1,4,L
171
+ 1,5,1,5,B
172
+ 1,5,2,1,L
173
+ 1,5,2,2,L
174
+ 1,5,2,3,R
175
+ 1,5,2,4,R
176
+ 1,5,2,5,R
177
+ 1,5,3,1,L
178
+ 1,5,3,2,R
179
+ 1,5,3,3,R
180
+ 1,5,3,4,R
181
+ 1,5,3,5,R
182
+ 1,5,4,1,L
183
+ 1,5,4,2,R
184
+ 1,5,4,3,R
185
+ 1,5,4,4,R
186
+ 1,5,4,5,R
187
+ 1,5,5,1,B
188
+ 1,5,5,2,R
189
+ 1,5,5,3,R
190
+ 1,5,5,4,R
191
+ 1,5,5,5,R
192
+ 2,1,1,1,L
193
+ 2,1,1,2,B
194
+ 2,1,1,3,R
195
+ 2,1,1,4,R
196
+ 2,1,1,5,R
197
+ 2,1,2,1,B
198
+ 2,1,2,2,R
199
+ 2,1,2,3,R
200
+ 2,1,2,4,R
201
+ 2,1,2,5,R
202
+ 2,1,3,1,R
203
+ 2,1,3,2,R
204
+ 2,1,3,3,R
205
+ 2,1,3,4,R
206
+ 2,1,3,5,R
207
+ 2,1,4,1,R
208
+ 2,1,4,2,R
209
+ 2,1,4,3,R
210
+ 2,1,4,4,R
211
+ 2,1,4,5,R
212
+ 2,1,5,1,R
213
+ 2,1,5,2,R
214
+ 2,1,5,3,R
215
+ 2,1,5,4,R
216
+ 2,1,5,5,R
217
+ 2,2,1,1,L
218
+ 2,2,1,2,L
219
+ 2,2,1,3,L
220
+ 2,2,1,4,B
221
+ 2,2,1,5,R
222
+ 2,2,2,1,L
223
+ 2,2,2,2,B
224
+ 2,2,2,3,R
225
+ 2,2,2,4,R
226
+ 2,2,2,5,R
227
+ 2,2,3,1,L
228
+ 2,2,3,2,R
229
+ 2,2,3,3,R
230
+ 2,2,3,4,R
231
+ 2,2,3,5,R
232
+ 2,2,4,1,B
233
+ 2,2,4,2,R
234
+ 2,2,4,3,R
235
+ 2,2,4,4,R
236
+ 2,2,4,5,R
237
+ 2,2,5,1,R
238
+ 2,2,5,2,R
239
+ 2,2,5,3,R
240
+ 2,2,5,4,R
241
+ 2,2,5,5,R
242
+ 2,3,1,1,L
243
+ 2,3,1,2,L
244
+ 2,3,1,3,L
245
+ 2,3,1,4,L
246
+ 2,3,1,5,L
247
+ 2,3,2,1,L
248
+ 2,3,2,2,L
249
+ 2,3,2,3,B
250
+ 2,3,2,4,R
251
+ 2,3,2,5,R
252
+ 2,3,3,1,L
253
+ 2,3,3,2,B
254
+ 2,3,3,3,R
255
+ 2,3,3,4,R
256
+ 2,3,3,5,R
257
+ 2,3,4,1,L
258
+ 2,3,4,2,R
259
+ 2,3,4,3,R
260
+ 2,3,4,4,R
261
+ 2,3,4,5,R
262
+ 2,3,5,1,L
263
+ 2,3,5,2,R
264
+ 2,3,5,3,R
265
+ 2,3,5,4,R
266
+ 2,3,5,5,R
267
+ 2,4,1,1,L
268
+ 2,4,1,2,L
269
+ 2,4,1,3,L
270
+ 2,4,1,4,L
271
+ 2,4,1,5,L
272
+ 2,4,2,1,L
273
+ 2,4,2,2,L
274
+ 2,4,2,3,L
275
+ 2,4,2,4,B
276
+ 2,4,2,5,R
277
+ 2,4,3,1,L
278
+ 2,4,3,2,L
279
+ 2,4,3,3,R
280
+ 2,4,3,4,R
281
+ 2,4,3,5,R
282
+ 2,4,4,1,L
283
+ 2,4,4,2,B
284
+ 2,4,4,3,R
285
+ 2,4,4,4,R
286
+ 2,4,4,5,R
287
+ 2,4,5,1,L
288
+ 2,4,5,2,R
289
+ 2,4,5,3,R
290
+ 2,4,5,4,R
291
+ 2,4,5,5,R
292
+ 2,5,1,1,L
293
+ 2,5,1,2,L
294
+ 2,5,1,3,L
295
+ 2,5,1,4,L
296
+ 2,5,1,5,L
297
+ 2,5,2,1,L
298
+ 2,5,2,2,L
299
+ 2,5,2,3,L
300
+ 2,5,2,4,L
301
+ 2,5,2,5,B
302
+ 2,5,3,1,L
303
+ 2,5,3,2,L
304
+ 2,5,3,3,L
305
+ 2,5,3,4,R
306
+ 2,5,3,5,R
307
+ 2,5,4,1,L
308
+ 2,5,4,2,L
309
+ 2,5,4,3,R
310
+ 2,5,4,4,R
311
+ 2,5,4,5,R
312
+ 2,5,5,1,L
313
+ 2,5,5,2,B
314
+ 2,5,5,3,R
315
+ 2,5,5,4,R
316
+ 2,5,5,5,R
317
+ 3,1,1,1,L
318
+ 3,1,1,2,L
319
+ 3,1,1,3,B
320
+ 3,1,1,4,R
321
+ 3,1,1,5,R
322
+ 3,1,2,1,L
323
+ 3,1,2,2,R
324
+ 3,1,2,3,R
325
+ 3,1,2,4,R
326
+ 3,1,2,5,R
327
+ 3,1,3,1,B
328
+ 3,1,3,2,R
329
+ 3,1,3,3,R
330
+ 3,1,3,4,R
331
+ 3,1,3,5,R
332
+ 3,1,4,1,R
333
+ 3,1,4,2,R
334
+ 3,1,4,3,R
335
+ 3,1,4,4,R
336
+ 3,1,4,5,R
337
+ 3,1,5,1,R
338
+ 3,1,5,2,R
339
+ 3,1,5,3,R
340
+ 3,1,5,4,R
341
+ 3,1,5,5,R
342
+ 3,2,1,1,L
343
+ 3,2,1,2,L
344
+ 3,2,1,3,L
345
+ 3,2,1,4,L
346
+ 3,2,1,5,L
347
+ 3,2,2,1,L
348
+ 3,2,2,2,L
349
+ 3,2,2,3,B
350
+ 3,2,2,4,R
351
+ 3,2,2,5,R
352
+ 3,2,3,1,L
353
+ 3,2,3,2,B
354
+ 3,2,3,3,R
355
+ 3,2,3,4,R
356
+ 3,2,3,5,R
357
+ 3,2,4,1,L
358
+ 3,2,4,2,R
359
+ 3,2,4,3,R
360
+ 3,2,4,4,R
361
+ 3,2,4,5,R
362
+ 3,2,5,1,L
363
+ 3,2,5,2,R
364
+ 3,2,5,3,R
365
+ 3,2,5,4,R
366
+ 3,2,5,5,R
367
+ 3,3,1,1,L
368
+ 3,3,1,2,L
369
+ 3,3,1,3,L
370
+ 3,3,1,4,L
371
+ 3,3,1,5,L
372
+ 3,3,2,1,L
373
+ 3,3,2,2,L
374
+ 3,3,2,3,L
375
+ 3,3,2,4,L
376
+ 3,3,2,5,R
377
+ 3,3,3,1,L
378
+ 3,3,3,2,L
379
+ 3,3,3,3,B
380
+ 3,3,3,4,R
381
+ 3,3,3,5,R
382
+ 3,3,4,1,L
383
+ 3,3,4,2,L
384
+ 3,3,4,3,R
385
+ 3,3,4,4,R
386
+ 3,3,4,5,R
387
+ 3,3,5,1,L
388
+ 3,3,5,2,R
389
+ 3,3,5,3,R
390
+ 3,3,5,4,R
391
+ 3,3,5,5,R
392
+ 3,4,1,1,L
393
+ 3,4,1,2,L
394
+ 3,4,1,3,L
395
+ 3,4,1,4,L
396
+ 3,4,1,5,L
397
+ 3,4,2,1,L
398
+ 3,4,2,2,L
399
+ 3,4,2,3,L
400
+ 3,4,2,4,L
401
+ 3,4,2,5,L
402
+ 3,4,3,1,L
403
+ 3,4,3,2,L
404
+ 3,4,3,3,L
405
+ 3,4,3,4,B
406
+ 3,4,3,5,R
407
+ 3,4,4,1,L
408
+ 3,4,4,2,L
409
+ 3,4,4,3,B
410
+ 3,4,4,4,R
411
+ 3,4,4,5,R
412
+ 3,4,5,1,L
413
+ 3,4,5,2,L
414
+ 3,4,5,3,R
415
+ 3,4,5,4,R
416
+ 3,4,5,5,R
417
+ 3,5,1,1,L
418
+ 3,5,1,2,L
419
+ 3,5,1,3,L
420
+ 3,5,1,4,L
421
+ 3,5,1,5,L
422
+ 3,5,2,1,L
423
+ 3,5,2,2,L
424
+ 3,5,2,3,L
425
+ 3,5,2,4,L
426
+ 3,5,2,5,L
427
+ 3,5,3,1,L
428
+ 3,5,3,2,L
429
+ 3,5,3,3,L
430
+ 3,5,3,4,L
431
+ 3,5,3,5,B
432
+ 3,5,4,1,L
433
+ 3,5,4,2,L
434
+ 3,5,4,3,L
435
+ 3,5,4,4,R
436
+ 3,5,4,5,R
437
+ 3,5,5,1,L
438
+ 3,5,5,2,L
439
+ 3,5,5,3,B
440
+ 3,5,5,4,R
441
+ 3,5,5,5,R
442
+ 4,1,1,1,L
443
+ 4,1,1,2,L
444
+ 4,1,1,3,L
445
+ 4,1,1,4,B
446
+ 4,1,1,5,R
447
+ 4,1,2,1,L
448
+ 4,1,2,2,B
449
+ 4,1,2,3,R
450
+ 4,1,2,4,R
451
+ 4,1,2,5,R
452
+ 4,1,3,1,L
453
+ 4,1,3,2,R
454
+ 4,1,3,3,R
455
+ 4,1,3,4,R
456
+ 4,1,3,5,R
457
+ 4,1,4,1,B
458
+ 4,1,4,2,R
459
+ 4,1,4,3,R
460
+ 4,1,4,4,R
461
+ 4,1,4,5,R
462
+ 4,1,5,1,R
463
+ 4,1,5,2,R
464
+ 4,1,5,3,R
465
+ 4,1,5,4,R
466
+ 4,1,5,5,R
467
+ 4,2,1,1,L
468
+ 4,2,1,2,L
469
+ 4,2,1,3,L
470
+ 4,2,1,4,L
471
+ 4,2,1,5,L
472
+ 4,2,2,1,L
473
+ 4,2,2,2,L
474
+ 4,2,2,3,L
475
+ 4,2,2,4,B
476
+ 4,2,2,5,R
477
+ 4,2,3,1,L
478
+ 4,2,3,2,L
479
+ 4,2,3,3,R
480
+ 4,2,3,4,R
481
+ 4,2,3,5,R
482
+ 4,2,4,1,L
483
+ 4,2,4,2,B
484
+ 4,2,4,3,R
485
+ 4,2,4,4,R
486
+ 4,2,4,5,R
487
+ 4,2,5,1,L
488
+ 4,2,5,2,R
489
+ 4,2,5,3,R
490
+ 4,2,5,4,R
491
+ 4,2,5,5,R
492
+ 4,3,1,1,L
493
+ 4,3,1,2,L
494
+ 4,3,1,3,L
495
+ 4,3,1,4,L
496
+ 4,3,1,5,L
497
+ 4,3,2,1,L
498
+ 4,3,2,2,L
499
+ 4,3,2,3,L
500
+ 4,3,2,4,L
501
+ 4,3,2,5,L
502
+ 4,3,3,1,L
503
+ 4,3,3,2,L
504
+ 4,3,3,3,L
505
+ 4,3,3,4,B
506
+ 4,3,3,5,R
507
+ 4,3,4,1,L
508
+ 4,3,4,2,L
509
+ 4,3,4,3,B
510
+ 4,3,4,4,R
511
+ 4,3,4,5,R
512
+ 4,3,5,1,L
513
+ 4,3,5,2,L
514
+ 4,3,5,3,R
515
+ 4,3,5,4,R
516
+ 4,3,5,5,R
517
+ 4,4,1,1,L
518
+ 4,4,1,2,L
519
+ 4,4,1,3,L
520
+ 4,4,1,4,L
521
+ 4,4,1,5,L
522
+ 4,4,2,1,L
523
+ 4,4,2,2,L
524
+ 4,4,2,3,L
525
+ 4,4,2,4,L
526
+ 4,4,2,5,L
527
+ 4,4,3,1,L
528
+ 4,4,3,2,L
529
+ 4,4,3,3,L
530
+ 4,4,3,4,L
531
+ 4,4,3,5,L
532
+ 4,4,4,1,L
533
+ 4,4,4,2,L
534
+ 4,4,4,3,L
535
+ 4,4,4,4,B
536
+ 4,4,4,5,R
537
+ 4,4,5,1,L
538
+ 4,4,5,2,L
539
+ 4,4,5,3,L
540
+ 4,4,5,4,R
541
+ 4,4,5,5,R
542
+ 4,5,1,1,L
543
+ 4,5,1,2,L
544
+ 4,5,1,3,L
545
+ 4,5,1,4,L
546
+ 4,5,1,5,L
547
+ 4,5,2,1,L
548
+ 4,5,2,2,L
549
+ 4,5,2,3,L
550
+ 4,5,2,4,L
551
+ 4,5,2,5,L
552
+ 4,5,3,1,L
553
+ 4,5,3,2,L
554
+ 4,5,3,3,L
555
+ 4,5,3,4,L
556
+ 4,5,3,5,L
557
+ 4,5,4,1,L
558
+ 4,5,4,2,L
559
+ 4,5,4,3,L
560
+ 4,5,4,4,L
561
+ 4,5,4,5,B
562
+ 4,5,5,1,L
563
+ 4,5,5,2,L
564
+ 4,5,5,3,L
565
+ 4,5,5,4,B
566
+ 4,5,5,5,R
567
+ 5,1,1,1,L
568
+ 5,1,1,2,L
569
+ 5,1,1,3,L
570
+ 5,1,1,4,L
571
+ 5,1,1,5,B
572
+ 5,1,2,1,L
573
+ 5,1,2,2,L
574
+ 5,1,2,3,R
575
+ 5,1,2,4,R
576
+ 5,1,2,5,R
577
+ 5,1,3,1,L
578
+ 5,1,3,2,R
579
+ 5,1,3,3,R
580
+ 5,1,3,4,R
581
+ 5,1,3,5,R
582
+ 5,1,4,1,L
583
+ 5,1,4,2,R
584
+ 5,1,4,3,R
585
+ 5,1,4,4,R
586
+ 5,1,4,5,R
587
+ 5,1,5,1,B
588
+ 5,1,5,2,R
589
+ 5,1,5,3,R
590
+ 5,1,5,4,R
591
+ 5,1,5,5,R
592
+ 5,2,1,1,L
593
+ 5,2,1,2,L
594
+ 5,2,1,3,L
595
+ 5,2,1,4,L
596
+ 5,2,1,5,L
597
+ 5,2,2,1,L
598
+ 5,2,2,2,L
599
+ 5,2,2,3,L
600
+ 5,2,2,4,L
601
+ 5,2,2,5,B
602
+ 5,2,3,1,L
603
+ 5,2,3,2,L
604
+ 5,2,3,3,L
605
+ 5,2,3,4,R
606
+ 5,2,3,5,R
607
+ 5,2,4,1,L
608
+ 5,2,4,2,L
609
+ 5,2,4,3,R
610
+ 5,2,4,4,R
611
+ 5,2,4,5,R
612
+ 5,2,5,1,L
613
+ 5,2,5,2,B
614
+ 5,2,5,3,R
615
+ 5,2,5,4,R
616
+ 5,2,5,5,R
617
+ 5,3,1,1,L
618
+ 5,3,1,2,L
619
+ 5,3,1,3,L
620
+ 5,3,1,4,L
621
+ 5,3,1,5,L
622
+ 5,3,2,1,L
623
+ 5,3,2,2,L
624
+ 5,3,2,3,L
625
+ 5,3,2,4,L
626
+ 5,3,2,5,L
627
+ 5,3,3,1,L
628
+ 5,3,3,2,L
629
+ 5,3,3,3,L
630
+ 5,3,3,4,L
631
+ 5,3,3,5,B
632
+ 5,3,4,1,L
633
+ 5,3,4,2,L
634
+ 5,3,4,3,L
635
+ 5,3,4,4,R
636
+ 5,3,4,5,R
637
+ 5,3,5,1,L
638
+ 5,3,5,2,L
639
+ 5,3,5,3,B
640
+ 5,3,5,4,R
641
+ 5,3,5,5,R
642
+ 5,4,1,1,L
643
+ 5,4,1,2,L
644
+ 5,4,1,3,L
645
+ 5,4,1,4,L
646
+ 5,4,1,5,L
647
+ 5,4,2,1,L
648
+ 5,4,2,2,L
649
+ 5,4,2,3,L
650
+ 5,4,2,4,L
651
+ 5,4,2,5,L
652
+ 5,4,3,1,L
653
+ 5,4,3,2,L
654
+ 5,4,3,3,L
655
+ 5,4,3,4,L
656
+ 5,4,3,5,L
657
+ 5,4,4,1,L
658
+ 5,4,4,2,L
659
+ 5,4,4,3,L
660
+ 5,4,4,4,L
661
+ 5,4,4,5,B
662
+ 5,4,5,1,L
663
+ 5,4,5,2,L
664
+ 5,4,5,3,L
665
+ 5,4,5,4,B
666
+ 5,4,5,5,R
667
+ 5,5,1,1,L
668
+ 5,5,1,2,L
669
+ 5,5,1,3,L
670
+ 5,5,1,4,L
671
+ 5,5,1,5,L
672
+ 5,5,2,1,L
673
+ 5,5,2,2,L
674
+ 5,5,2,3,L
675
+ 5,5,2,4,L
676
+ 5,5,2,5,L
677
+ 5,5,3,1,L
678
+ 5,5,3,2,L
679
+ 5,5,3,3,L
680
+ 5,5,3,4,L
681
+ 5,5,3,5,L
682
+ 5,5,4,1,L
683
+ 5,5,4,2,L
684
+ 5,5,4,3,L
685
+ 5,5,4,4,L
686
+ 5,5,4,5,L
687
+ 5,5,5,1,L
688
+ 5,5,5,2,L
689
+ 5,5,5,3,L
690
+ 5,5,5,4,L
691
+ 5,5,5,5,B
692
+ %
693
+ %
694
+ %
iris.csv ADDED
@@ -0,0 +1,151 @@
1
+ 5.1,3.5,1.4,0.2,Iris-setosa
2
+ 4.9,3.0,1.4,0.2,Iris-setosa
3
+ 4.7,3.2,1.3,0.2,Iris-setosa
4
+ 4.6,3.1,1.5,0.2,Iris-setosa
5
+ 5.0,3.6,1.4,0.2,Iris-setosa
6
+ 5.4,3.9,1.7,0.4,Iris-setosa
7
+ 4.6,3.4,1.4,0.3,Iris-setosa
8
+ 5.0,3.4,1.5,0.2,Iris-setosa
9
+ 4.4,2.9,1.4,0.2,Iris-setosa
10
+ 4.9,3.1,1.5,0.1,Iris-setosa
11
+ 5.4,3.7,1.5,0.2,Iris-setosa
12
+ 4.8,3.4,1.6,0.2,Iris-setosa
13
+ 4.8,3.0,1.4,0.1,Iris-setosa
14
+ 4.3,3.0,1.1,0.1,Iris-setosa
15
+ 5.8,4.0,1.2,0.2,Iris-setosa
16
+ 5.7,4.4,1.5,0.4,Iris-setosa
17
+ 5.4,3.9,1.3,0.4,Iris-setosa
18
+ 5.1,3.5,1.4,0.3,Iris-setosa
19
+ 5.7,3.8,1.7,0.3,Iris-setosa
20
+ 5.1,3.8,1.5,0.3,Iris-setosa
21
+ 5.4,3.4,1.7,0.2,Iris-setosa
22
+ 5.1,3.7,1.5,0.4,Iris-setosa
23
+ 4.6,3.6,1.0,0.2,Iris-setosa
24
+ 5.1,3.3,1.7,0.5,Iris-setosa
25
+ 4.8,3.4,1.9,0.2,Iris-setosa
26
+ 5.0,3.0,1.6,0.2,Iris-setosa
27
+ 5.0,3.4,1.6,0.4,Iris-setosa
28
+ 5.2,3.5,1.5,0.2,Iris-setosa
29
+ 5.2,3.4,1.4,0.2,Iris-setosa
30
+ 4.7,3.2,1.6,0.2,Iris-setosa
31
+ 4.8,3.1,1.6,0.2,Iris-setosa
32
+ 5.4,3.4,1.5,0.4,Iris-setosa
33
+ 5.2,4.1,1.5,0.1,Iris-setosa
34
+ 5.5,4.2,1.4,0.2,Iris-setosa
35
+ 4.9,3.1,1.5,0.1,Iris-setosa
36
+ 5.0,3.2,1.2,0.2,Iris-setosa
37
+ 5.5,3.5,1.3,0.2,Iris-setosa
38
+ 4.9,3.1,1.5,0.1,Iris-setosa
39
+ 4.4,3.0,1.3,0.2,Iris-setosa
40
+ 5.1,3.4,1.5,0.2,Iris-setosa
41
+ 5.0,3.5,1.3,0.3,Iris-setosa
42
+ 4.5,2.3,1.3,0.3,Iris-setosa
43
+ 4.4,3.2,1.3,0.2,Iris-setosa
44
+ 5.0,3.5,1.6,0.6,Iris-setosa
45
+ 5.1,3.8,1.9,0.4,Iris-setosa
46
+ 4.8,3.0,1.4,0.3,Iris-setosa
47
+ 5.1,3.8,1.6,0.2,Iris-setosa
48
+ 4.6,3.2,1.4,0.2,Iris-setosa
49
+ 5.3,3.7,1.5,0.2,Iris-setosa
50
+ 5.0,3.3,1.4,0.2,Iris-setosa
51
+ 7.0,3.2,4.7,1.4,Iris-versicolor
52
+ 6.4,3.2,4.5,1.5,Iris-versicolor
53
+ 6.9,3.1,4.9,1.5,Iris-versicolor
54
+ 5.5,2.3,4.0,1.3,Iris-versicolor
55
+ 6.5,2.8,4.6,1.5,Iris-versicolor
56
+ 5.7,2.8,4.5,1.3,Iris-versicolor
57
+ 6.3,3.3,4.7,1.6,Iris-versicolor
58
+ 4.9,2.4,3.3,1.0,Iris-versicolor
59
+ 6.6,2.9,4.6,1.3,Iris-versicolor
60
+ 5.2,2.7,3.9,1.4,Iris-versicolor
61
+ 5.0,2.0,3.5,1.0,Iris-versicolor
62
+ 5.9,3.0,4.2,1.5,Iris-versicolor
63
+ 6.0,2.2,4.0,1.0,Iris-versicolor
64
+ 6.1,2.9,4.7,1.4,Iris-versicolor
65
+ 5.6,2.9,3.6,1.3,Iris-versicolor
66
+ 6.7,3.1,4.4,1.4,Iris-versicolor
67
+ 5.6,3.0,4.5,1.5,Iris-versicolor
68
+ 5.8,2.7,4.1,1.0,Iris-versicolor
69
+ 6.2,2.2,4.5,1.5,Iris-versicolor
70
+ 5.6,2.5,3.9,1.1,Iris-versicolor
71
+ 5.9,3.2,4.8,1.8,Iris-versicolor
72
+ 6.1,2.8,4.0,1.3,Iris-versicolor
73
+ 6.3,2.5,4.9,1.5,Iris-versicolor
74
+ 6.1,2.8,4.7,1.2,Iris-versicolor
75
+ 6.4,2.9,4.3,1.3,Iris-versicolor
76
+ 6.6,3.0,4.4,1.4,Iris-versicolor
77
+ 6.8,2.8,4.8,1.4,Iris-versicolor
78
+ 6.7,3.0,5.0,1.7,Iris-versicolor
79
+ 6.0,2.9,4.5,1.5,Iris-versicolor
80
+ 5.7,2.6,3.5,1.0,Iris-versicolor
81
+ 5.5,2.4,3.8,1.1,Iris-versicolor
82
+ 5.5,2.4,3.7,1.0,Iris-versicolor
83
+ 5.8,2.7,3.9,1.2,Iris-versicolor
84
+ 6.0,2.7,5.1,1.6,Iris-versicolor
85
+ 5.4,3.0,4.5,1.5,Iris-versicolor
86
+ 6.0,3.4,4.5,1.6,Iris-versicolor
87
+ 6.7,3.1,4.7,1.5,Iris-versicolor
88
+ 6.3,2.3,4.4,1.3,Iris-versicolor
89
+ 5.6,3.0,4.1,1.3,Iris-versicolor
90
+ 5.5,2.5,4.0,1.3,Iris-versicolor
91
+ 5.5,2.6,4.4,1.2,Iris-versicolor
92
+ 6.1,3.0,4.6,1.4,Iris-versicolor
93
+ 5.8,2.6,4.0,1.2,Iris-versicolor
94
+ 5.0,2.3,3.3,1.0,Iris-versicolor
95
+ 5.6,2.7,4.2,1.3,Iris-versicolor
96
+ 5.7,3.0,4.2,1.2,Iris-versicolor
97
+ 5.7,2.9,4.2,1.3,Iris-versicolor
98
+ 6.2,2.9,4.3,1.3,Iris-versicolor
99
+ 5.1,2.5,3.0,1.1,Iris-versicolor
100
+ 5.7,2.8,4.1,1.3,Iris-versicolor
101
+ 6.3,3.3,6.0,2.5,Iris-virginica
102
+ 5.8,2.7,5.1,1.9,Iris-virginica
103
+ 7.1,3.0,5.9,2.1,Iris-virginica
104
+ 6.3,2.9,5.6,1.8,Iris-virginica
105
+ 6.5,3.0,5.8,2.2,Iris-virginica
106
+ 7.6,3.0,6.6,2.1,Iris-virginica
107
+ 4.9,2.5,4.5,1.7,Iris-virginica
108
+ 7.3,2.9,6.3,1.8,Iris-virginica
109
+ 6.7,2.5,5.8,1.8,Iris-virginica
110
+ 7.2,3.6,6.1,2.5,Iris-virginica
111
+ 6.5,3.2,5.1,2.0,Iris-virginica
112
+ 6.4,2.7,5.3,1.9,Iris-virginica
113
+ 6.8,3.0,5.5,2.1,Iris-virginica
114
+ 5.7,2.5,5.0,2.0,Iris-virginica
115
+ 5.8,2.8,5.1,2.4,Iris-virginica
116
+ 6.4,3.2,5.3,2.3,Iris-virginica
117
+ 6.5,3.0,5.5,1.8,Iris-virginica
118
+ 7.7,3.8,6.7,2.2,Iris-virginica
119
+ 7.7,2.6,6.9,2.3,Iris-virginica
120
+ 6.0,2.2,5.0,1.5,Iris-virginica
121
+ 6.9,3.2,5.7,2.3,Iris-virginica
122
+ 5.6,2.8,4.9,2.0,Iris-virginica
123
+ 7.7,2.8,6.7,2.0,Iris-virginica
124
+ 6.3,2.7,4.9,1.8,Iris-virginica
125
+ 6.7,3.3,5.7,2.1,Iris-virginica
126
+ 7.2,3.2,6.0,1.8,Iris-virginica
127
+ 6.2,2.8,4.8,1.8,Iris-virginica
128
+ 6.1,3.0,4.9,1.8,Iris-virginica
129
+ 6.4,2.8,5.6,2.1,Iris-virginica
130
+ 7.2,3.0,5.8,1.6,Iris-virginica
131
+ 7.4,2.8,6.1,1.9,Iris-virginica
132
+ 7.9,3.8,6.4,2.0,Iris-virginica
133
+ 6.4,2.8,5.6,2.2,Iris-virginica
134
+ 6.3,2.8,5.1,1.5,Iris-virginica
135
+ 6.1,2.6,5.6,1.4,Iris-virginica
136
+ 7.7,3.0,6.1,2.3,Iris-virginica
137
+ 6.3,3.4,5.6,2.4,Iris-virginica
138
+ 6.4,3.1,5.5,1.8,Iris-virginica
139
+ 6.0,3.0,4.8,1.8,Iris-virginica
140
+ 6.9,3.1,5.4,2.1,Iris-virginica
141
+ 6.7,3.1,5.6,2.4,Iris-virginica
142
+ 6.9,3.1,5.1,2.3,Iris-virginica
143
+ 5.8,2.7,5.1,1.9,Iris-virginica
144
+ 6.8,3.2,5.9,2.3,Iris-virginica
145
+ 6.7,3.3,5.7,2.5,Iris-virginica
146
+ 6.7,3.0,5.2,2.3,Iris-virginica
147
+ 6.3,2.5,5.0,1.9,Iris-virginica
148
+ 6.5,3.0,5.2,2.0,Iris-virginica
149
+ 6.2,3.4,5.4,2.3,Iris-virginica
150
+ 5.9,3.0,5.1,1.8,Iris-virginica
151
+
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ # Please use Python 3.7 to be compatible with all packages
2
+ gpytorch==1.5.0
3
+ torch==1.9.0
4
+ scikit-learn==0.24.2
5
+ pyyaml==5.4.1
6
+ seaborn==0.11.2
7
+ xgboost==1.4.0
8
+ tqdm==4.62.1
9
+ numpy==1.21.2
10
+ openml==0.12.2
11
+ catboost==0.26.1
12
+ auto-sklearn==0.14.5
13
+ hyperopt==0.2.5
14
+ configspace==0.4.21
15
+ # autogluon==0.4.0
16
+ gradio==3.1.1