File size: 3,305 Bytes
e487255
 
 
 
 
 
 
 
 
 
a833f5f
e487255
 
a833f5f
 
a07780a
a833f5f
 
 
 
 
a07780a
a833f5f
 
 
a07780a
a833f5f
 
 
 
 
e487255
 
a833f5f
 
 
 
 
e487255
 
a833f5f
 
a07780a
e487255
 
 
a833f5f
 
e487255
 
a833f5f
e487255
a833f5f
 
 
 
e487255
 
 
 
a07780a
e487255
 
e3f4b64
a833f5f
 
 
 
 
a07780a
e487255
a833f5f
e487255
 
 
a833f5f
 
e487255
a07780a
e487255
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import sys
tabpfn_path = 'TabPFN'
sys.path.insert(0, tabpfn_path) # our submodule of the TabPFN repo (at 045c8400203ebd062346970b4f2c0ccda5a40618)
from TabPFN.scripts.transformer_prediction_interface import TabPFNClassifier

import numpy as np
import pandas as pd
import torch
import gradio as gr
import openml
from sklearn.model_selection import cross_val_score


def compute(file, y_attribute, cv_folds):
    """Cross-validate TabPFN on an uploaded .arff file and report ROC AUC (OVO).

    Args:
        file: Uploaded file object exposing a ``.name`` path, or ``None`` when
            nothing has been uploaded.
        y_attribute: Name of the attribute to use as the prediction target.
        cv_folds: Number of cross-validation folds (an integer).

    Returns:
        A ``(message, y_attribute)`` tuple. ``message`` is either a prompt
        (missing or non-.arff upload, unknown target attribute) or the score
        summary; ``y_attribute`` is passed back unchanged.
    """
    if file is None or not file.name.endswith('.arff'):
        return 'Please upload a .arff file', y_attribute

    dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=file.name)
    # First load without a target, only to learn which attribute names exist.
    _, _, _, attribute_names = dataset.get_data(dataset_format="array")
    if y_attribute not in attribute_names:
        return f"**Select attribute from {', '.join(attribute_names)}**", y_attribute
    X, y, _, _ = dataset.get_data(dataset_format="array", target=y_attribute)

    # Shuffle rows with a fixed seed so repeated runs use the same ordering.
    order = np.arange(y.shape[0])
    np.random.seed(13)
    np.random.shuffle(order)
    X, y = torch.tensor(X[order]), torch.tensor(y[order])

    classifier = TabPFNClassifier(base_path=tabpfn_path, device='cpu')

    scores = cross_val_score(classifier, X, y, cv=cv_folds, scoring='roc_auc_ovo')
    print(scores)

    # Each split trains on every fold except the held-out one, so the largest
    # training set is n minus the smallest test fold (n // cv_folds). Comparing
    # the test-fold size against the limit would miss the warning for k > 2.
    n_samples = len(y)
    largest_train_size = n_samples - n_samples // cv_folds
    if largest_train_size > 1024:
        note = ("The PFN is only trained for datasets with up to 1024 training examples "
                "and it had to extrapolate to greater datasets for this evaluation.")
    else:
        note = ""

    summary = f"ROC AUC OVO Cross Val mean is {sum(scores) / len(scores)} from {scores}. "
    return summary + note, y_attribute


def upload_file(file):
    """Inspect an uploaded .arff file and propose a target attribute.

    Args:
        file: Uploaded file object exposing a ``.name`` path, or ``None`` when
            the upload is empty.

    Returns:
        A ``(message, default_attribute)`` tuple. For a readable .arff file the
        message lists the attribute names and the default is the last
        attribute; otherwise the message asks for a .arff file and the default
        is ``None``.
    """
    # Always return two values: this handler is wired to two output
    # components, so a bare ``return`` on an empty upload is not enough.
    if file is None or not file.name.endswith('.arff'):
        return 'Please upload a .arff file', None

    dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=file.name)
    _, _, _, attribute_names = dataset.get_data(dataset_format="array")
    return f"Select attribute from {', '.join(attribute_names)}", attribute_names[-1]


with gr.Blocks() as demo:
    # UI definition: components are created in the order they appear below.
    gr.Markdown("""This demo allows you to play with the **TabPFN**.
    Upload a .arff file, select an attribute to predict and the number of cross validation folds and get the ROC AUC OVO score for one seed.
    """)
    inp_file = gr.File(
        label='Drop a .arff file.')
    cv_folds = gr.Dropdown([2, 3, 4, 5], value=2, label='Number of CV folds')
    # Shared message area: both upload_file and compute write their text here.
    out_text = gr.Markdown()

    # Target attribute name: filled in by upload_file and read back by compute.
    y_attribute = gr.Textbox(label='y attribute')

    # Example input routed through upload_file, same outputs as a real upload.
    # Presumably 'balance-scale.arff' ships next to this script - TODO confirm.
    examples = gr.Examples(examples=['balance-scale.arff'],
                           inputs=[inp_file],
                           outputs=[out_text, y_attribute],
                           fn=upload_file,
                           cache_examples=True)
    # NOTE(review): the label mentions "Empty Table Cells", but the click
    # handler below runs compute, which reports a cross-validation score.
    # Confirm whether the label is intended.
    btn = gr.Button("Predict Empty Table Cells")
    # out_table = gr.DataFrame()
    # Re-inspect the file whenever the upload component changes.
    inp_file.change(fn=upload_file, inputs=inp_file, outputs=[out_text, y_attribute])

    # Run the evaluation; compute returns y_attribute back as its second output.
    btn.click(fn=compute, inputs=[inp_file, y_attribute, cv_folds], outputs=[out_text, y_attribute])

# Launched unconditionally when this module is executed (no __main__ guard).
demo.launch()