This notebook shows how to use TabPFN for tabular prediction with a scikit-learn wrapper.

```python
classifier = TabPFNClassifier(device='cpu')
classifier.fit(train_xs, train_ys)
prediction_ = classifier.predict(test_xs)
```

The `fit` function does not perform any computation; it only saves the training data. All computation happens at inference time, when `predict` is called.
Note that the pre-saved models were trained on datasets with up to 100 features, 10 classes, and 1000 samples. The model has no hard bound on the number of samples, but the numbers of features and classes are restricted: exceeding those limits leads to an error.

### Setup

In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell execution so that edits to the project's .py files
# take effect without restarting the kernel.
%load_ext autoreload

%autoreload 2

In [None]:
import time
import torch
import numpy as np
import os
import random

from model_builder import get_model, get_default_spec, save_model, load_model
from scripts.transformer_prediction_interface import transformer_predict, get_params_from_config, TabPFNClassifier

from datasets import load_openml_list, open_cc_dids, open_cc_valid_dids

from scripts import tabular_metrics

In [None]:
# Base directory for the notebook ("." is the current working directory).
# NOTE(review): not referenced by any other visible cell -- confirm it is still needed.
base_path = "."

### Load datasets

In [None]:
# Value forwarded as `max_samples` to both load_openml_list calls below.
max_samples = 10000
# NOTE(review): `bptt` is not used in this cell and is reassigned in a later
# cell -- confirm this assignment is still needed.
bptt = 10000

# Datasets identified by `open_cc_dids`, plus the accompanying dataframe
# returned by the loader.
cc_test_datasets_multiclass, cc_test_datasets_multiclass_df = load_openml_list(
    open_cc_dids,
    multiclass=True,
    shuffled=True,
    filter_for_nan=False,
    max_samples=max_samples,
    num_feats=100,
    return_capped=True,
)

# Same loader settings, applied to the ids in `open_cc_valid_dids`.
cc_valid_datasets_multiclass, cc_valid_datasets_multiclass_df = load_openml_list(
    open_cc_valid_dids,
    multiclass=True,
    shuffled=True,
    filter_for_nan=False,
    max_samples=max_samples,
    num_feats=100,
    return_capped=True,
)

# Optional: load longer OpenML datasets for generalization experiments.
# NOTE(review): `test_dids_classification` is not imported in the visible
# cells; it must be imported before this line is re-enabled.
# test_datasets_multiclass, test_datasets_multiclass_df = load_openml_list(test_dids_classification, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 10000, num_feats=100, return_capped=True)

# Seed Python's RNG so the in-place shuffle yields the same order on every run.
random.seed(0)
random.shuffle(cc_valid_datasets_multiclass)

In [None]:
from datasets import get_openml_classification

In [None]:
# `openml` is used below but is not imported in any visible cell, so on a
# fresh kernel this cell would raise NameError. Import it here so the cell
# survives Restart & Run All.
import openml

# Fetch OpenML dataset 31 and unpack the four values returned by get_data.
# The target column is the dataset's default target attribute.
dataset = openml.datasets.get_dataset(31)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute
)

In [None]:
def get_datasets(selector, task_type, suite='cc'):
    """Return one of the notebook-level dataset collections.

    Args:
        selector: 'valid' picks the validation collection; any other value
            picks the test collection.
        task_type: 'binary' picks the binary collections (``suite`` is then
            ignored); any other value picks the multiclass collections.
        suite: 'openml' or 'cc'; only consulted for non-binary task types.

    Returns:
        The module-level variable matching the requested combination.

    Raises:
        Exception: if the task type is not 'binary' and ``suite`` is neither
            'openml' nor 'cc'.

    Only the selected global is looked up, so collections that were never
    loaded do not cause an error unless they are actually requested.
    """
    want_valid = selector == 'valid'

    if task_type == 'binary':
        return valid_datasets_binary if want_valid else test_datasets_binary

    if suite == 'openml':
        return valid_datasets_multiclass if want_valid else test_datasets_multiclass

    if suite == 'cc':
        return cc_valid_datasets_multiclass if want_valid else cc_test_datasets_multiclass

    raise Exception("Unknown suite")

In [None]:
# Evaluation settings for this notebook.
model_string = ''
longer = 1
task_type = 'multiclass'
eval_positions = [1000]
bptt = 2000

# Pick the test and validation collections of the 'cc' suite for the task type above.
test_datasets = get_datasets('test', task_type, suite='cc')
valid_datasets = get_datasets('valid', task_type, suite='cc')

### Select a dataset for prediction

In [None]:
# Show (index, first field) for every test dataset; the first field is what
# a later cell prints as the dataset name.
[(idx, entry[0]) for idx, entry in enumerate(test_datasets)]

In [None]:
# Choose one entry of `test_datasets` by position and report its name
# (element 0) and the shape of element 1.
evaluation_dataset_index = 4 # Index of the dataset to predict
ds = test_datasets[evaluation_dataset_index]
print(f'Evaluation dataset name: {ds[0]} shape {ds[1].shape}')

In [None]:
# Work on copies so the selected dataset entry itself is left untouched.
# Assumes ds[1] holds the features and ds[2] the targets -- TODO confirm.
xs = ds[1].clone()
ys = ds[2].clone()

# Split at the midpoint of the first dimension: first half for training, rest for testing.
eval_position = xs.shape[0] // 2
train_xs, test_xs = xs[:eval_position], xs[eval_position:]
train_ys, test_ys = ys[:eval_position], ys[eval_position:]

### Predict using a Fitted and Tuned Model

In [None]:
# Build the classifier on CPU. Per the notebook introduction, fit() only
# stores the training data and the computation happens at prediction time.
classifier = TabPFNClassifier(device='cpu')
classifier.fit(train_xs, train_ys)
# predict_proba output is passed to the metric functions in the next cell;
# presumably per-class probabilities -- TODO confirm against the wrapper.
prediction_ = classifier.predict_proba(test_xs)

In [None]:
# Score the held-out predictions with the two metrics from tabular_metrics.
roc = tabular_metrics.auc_metric(test_ys, prediction_)
ce = tabular_metrics.cross_entropy(test_ys, prediction_)

# Last expression of the cell: shown as the cell output.
'AUC', float(roc), 'Cross Entropy', float(ce)