Spaces:

TabPFN
/

TabPFNPrediction

Runtime error

Update app.py

6482360 over 1 year ago

6.05 kB

	import sys
	tabpfn_path = 'TabPFN'
	sys.path.insert(0, tabpfn_path) # our submodule of the TabPFN repo (at 045c8400203ebd062346970b4f2c0ccda5a40618)
	from TabPFN.scripts.transformer_prediction_interface import TabPFNClassifier

	import numpy as np
	from pathlib import Path
	import pandas as pd
	import torch
	import gradio as gr
	import openml
	import os
	import matplotlib.pyplot as plt
	from matplotlib.colors import ListedColormap

	def compute(table: np.array):
	vfunc = np.vectorize(lambda s: len(str(s)))
	non_empty_row_mask = (vfunc(table).sum(1) != 0)
	table = table[non_empty_row_mask]
	empty_mask = table == '(predict)'
	empty_inds = np.where(empty_mask)
	if len(table.index) > 1024:
	return "⚠️ ERROR: TabPFN is not made for datasets with a trainingsize > 1024.", None, None
	if len(table.columns) > 100:
	return "⚠️ ERROR: TabPFN is not made for datasets with a feature size > 100.", None, None
	if not len(empty_inds[0]):
	return "⚠️ ERROR: Please leave at least one field blank for prediction.", None, None
	if not np.all(empty_inds[1][0] == empty_inds[1]):
	return "⚠️ Please only leave fields of one column blank for prediction.", None, None
	y_column = empty_inds[1][0]
	eval_lines = empty_inds[0]

	train_table = np.delete(table, eval_lines, axis=0)
	eval_table = table[eval_lines]

	try:
	x_train = torch.tensor(np.delete(train_table, y_column, axis=1).astype(np.float32))
	x_eval = torch.tensor(np.delete(eval_table, y_column, axis=1).astype(np.float32))

	y_train = train_table[:, y_column]
	except ValueError:
	return "⚠️ Please only add numbers (to the inputs) or leave fields empty.", None, None

	classifier = TabPFNClassifier(base_path=tabpfn_path, device='cpu')
	classifier.fit(x_train, y_train)
	y_eval, p_eval = classifier.predict(x_eval, return_winning_probability=True)

	# print(file, type(file))
	out_table = pd.DataFrame(table.copy().astype(str))
	out_table.iloc[eval_lines, y_column] = [f"{y_e} (p={p_e:.2f})" for y_e, p_e in zip(y_eval, p_eval)]
	out_table = out_table.iloc[eval_lines, :]
	out_table.columns = headers

	# PLOTTING
	fig = plt.figure(figsize=(10,10))
	ax = fig.add_subplot(111)
	cm = plt.cm.RdBu
	cm_bright = ListedColormap(["#FF0000", "#0000FF"])

	# Plot the training points
	vfunc = np.vectorize(lambda x : np.where(classifier.classes_ == x)[0])
	y_train_index = vfunc(y_train)
	y_train_index = y_train_index == 0
	y_train = y_train_index
	#x_train = x_train[y_train_index <= 1]
	#y_train = y_train[y_train_index <= 1]
	#y_train_index = y_train_index[y_train_index <= 1]

	ax.scatter(x_train[:, 0], x_train[:, 1], c=y_train_index, cmap=cm_bright)

	classifier = TabPFNClassifier(base_path=tabpfn_path, device='cpu')
	classifier.fit(x_train[:, 0:2], y_train)

	DecisionBoundaryDisplay.from_estimator(
	classifier, x_train[:, 0:2], alpha=0.6, ax=ax, eps=2.0, grid_resolution=100, response_method="predict_proba"
	)
	plt.xlabel(headers[0])
	plt.ylabel(headers[1])

	return None, out_table, fig


	def upload_file(file, remove_entries=10):
	global headers
	if file.name.endswith('.arff'):
	dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=file.name)
	X_, _, categorical_indicator_, attribute_names_ = dataset.get_data(
	dataset_format="array"
	)
	df = pd.DataFrame(X_, columns=attribute_names_)
	headers = np.arange(len(df.columns))
	df.columns = headers
	elif file.name.endswith('.csv') or file.name.endswith('.data'):
	df = pd.read_csv(file.name, header='infer')
	headers = np.arange(len(df.columns))
	df.columns = headers

	df.iloc[0:remove_entries, -1] = ''
	return df


	def update_table(table):
	global headers
	table = pd.DataFrame(table)
	vfunc = np.vectorize(lambda s: len(str(s)))
	non_empty_row_mask = (vfunc(table).sum(1) != 0)
	table = table[non_empty_row_mask]
	empty_mask = table == ''
	empty_inds = np.where(empty_mask)
	if not len(empty_inds[0]):
	return table

	y_column = empty_inds[1][0]
	eval_lines = empty_inds[0]

	table.iloc[eval_lines, y_column] = ''
	table.columns = headers

	return table

	headers = []

	gr.Markdown("""This demo allows you to experiment with the TabPFN model for tabular data.

	If you remove values in the target column, TabPFN will make predictions on them after clicking on the Button. The first 10 target values were already removed for this example dataset, so TabPFN will predict the first 10 classes.
	Please, provide everything but the targets as numeric values and only remove values in one column (the target column).
	""")

	with gr.Blocks() as demo:
	with gr.Row():
	with gr.Column():
	inp_table = gr.DataFrame(type='numpy', value=upload_file(Path('iris.csv'), remove_entries=10)
	, headers=[''] * 3)

	inp_file = gr.File(
	label='Drop either a .csv (without header, only numeric values for all but the labels) or a .arff file.')

	examples = gr.Examples(examples=['iris.csv', 'balance-scale.arff'],
	inputs=[inp_file],
	outputs=[inp_table],
	fn=upload_file,
	cache_examples=True)

	#inp_table.change(fn=update_table, inputs=inp_table, outputs=inp_table)

	with gr.Column():

	btn = gr.Button("Calculate Predictions")
	out_text = gr.Markdown()
	out_plot = gr.Plot(type="Matplotlib")
	out_table = gr.DataFrame()

	btn.click(fn=compute, inputs=inp_table, outputs=[out_text, out_table, out_plot])

	inp_file.change(fn=upload_file, inputs=inp_file, outputs=inp_table)

	demo.launch(share=True)