In [1]:
import numpy as np

import openml
import pandas as pd

In [2]:
from tqdm import tqdm

from datasets import load_openml_list, test_dids_classification, valid_large_classification, open_cc_dids, open_cc_valid_dids


In [6]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
 %reload_ext autoreload


### Prepare test datasets

In [7]:
# Maps OpenML metadata column names to the column headers used in the printed LaTeX tables.
renamer = {
    'name': 'Name',
    'NumberOfFeatures': '# Features',
    'NumberOfSymbolicFeatures': '# Categorical Features',
    'NumberOfInstances': '# Instances',
    'NumberOfMissingValues': '# NaNs',
    'NumberOfClasses': '# Classes',
    'MinorityClassSize': 'Minority Class Size',
}


In [8]:
# Display the benchmark suites returned by OpenML; the captured output includes
# suite 99 with alias 'OpenML-CC18'.
openml.study.list_suites()

OrderedDict([(99,
 {'id': 99,
 'alias': 'OpenML-CC18',
 'main_entity_type': 'task',
 'name': 'OpenML-CC18 Curated Classification benchmark',
 'status': 'active',
 'creation_date': '2019-02-21 18:47:13',
 'creator': 1}),
 (225,
 {'id': 225,
 'alias': 'OpenML-friendly',
 'main_entity_type': 'task',
 'name': 'OpenML100-friendly',
 'status': 'active',
 'creation_date': '2019-09-16 19:41:46',
 'creator': 1})])

In [9]:
# Fetch the suite with id 99 (shown as 'OpenML-CC18' in the suite listing output).
suite = openml.study.get_suite(suite_id=99)
# Retrieve the OpenML task listing as a pandas DataFrame.
tasks = openml.tasks.list_tasks(output_format="dataframe")

In [10]:
# Keep only the rows whose task id belongs to the selected suite.
in_suite = tasks["tid"].isin(suite.tasks)
tasks = tasks[in_suite]

In [11]:
# Task ids of the small tasks: at most 2000 instances, 100 features and 10 classes.
is_small_task = (
    (tasks.NumberOfInstances <= 2000)
    & (tasks.NumberOfFeatures <= 100)
    & (tasks.NumberOfClasses <= 10)
)
tids = list(tasks[is_small_task].tid)

In [12]:
# Show how many task ids are currently held in `tids`.
len(tids)

30

In [13]:
# Rebind `tids` using only the instance-count limit (<= 2000 instances);
# any value previously stored in `tids` is replaced.
within_instance_limit = tasks["NumberOfInstances"] <= 2000
tids = list(tasks.loc[within_instance_limit, "tid"])

In [14]:
# Resolve every selected task id to the id of the dataset behind that task.
# NOTE: this rebinds `open_cc_dids`, replacing the object of the same name
# imported from `datasets`.
open_cc_dids = [
    openml.tasks.get_task(tid).get_dataset().id
    for tid in tids
]

In [None]:
# Load the test classification datasets together with their metadata table.
open_ml_datasets, open_ml_datasets_df = load_openml_list(
    test_dids_classification,
    multiclass=True,
    shuffled=True,
    filter_for_nan=False,
    max_samples=100000,
    num_feats=100,
    return_capped=True,
)


In [16]:
# Restrict the metadata table to datasets with more than 10000 instances.
has_many_instances = open_ml_datasets_df["NumberOfInstances"] > 10000
open_ml_datasets_df = open_ml_datasets_df.loc[has_many_instances]

In [17]:
# Metadata columns that are cast to int before the table is printed.
count_cols = [
    'NumberOfFeatures',
    'NumberOfSymbolicFeatures',
    'NumberOfInstances',
    'NumberOfClasses',
    'NumberOfMissingValues',
    'MinorityClassSize',
]

# Work on a copy so the source metadata frame is left untouched.
print_table = open_ml_datasets_df[['name'] + count_cols].copy()
# Expose the frame's index as a regular column, since the index itself is not printed.
print_table['id'] = print_table.index
print_table[count_cols] = print_table[count_cols].astype(int)
print_table = print_table.rename(columns=renamer)
print(print_table.to_latex(index=False))

\begin{tabular}{lrrrrrrr}
\toprule
 Name & \# Features & \# Categorical Features & \# Instances & \# Classes & \# NaNs & Minority Class Size & id \\
\midrule
 KDDCup09\_appetency & 231 & 39 & 50000 & 2 & 8024152 & 890 & 1111 \\
 airlines & 8 & 5 & 539383 & 2 & 0 & 240264 & 1169 \\
 bank-marketing & 17 & 10 & 45211 & 2 & 0 & 5289 & 1461 \\
 nomao & 119 & 30 & 34465 & 2 & 0 & 9844 & 1486 \\
 adult & 15 & 9 & 48842 & 2 & 6465 & 11687 & 1590 \\
 covertype & 55 & 45 & 581012 & 7 & 0 & 2747 & 1596 \\
 numerai28.6 & 22 & 1 & 96320 & 2 & 0 & 47662 & 23517 \\
 connect-4 & 43 & 43 & 67557 & 3 & 0 & 6449 & 40668 \\
jungle\_chess\_2pcs\_raw\_endgame\_complete & 7 & 1 & 44819 & 3 & 0 & 4335 & 41027 \\
 APSFailure & 171 & 1 & 76000 & 2 & 1078695 & 1375 & 41138 \\
 albert & 79 & 53 & 425240 & 2 & 2734000 & 212620 & 41147 \\
 MiniBooNE & 51 & 1 & 130064 & 2 & 0 & 36499 & 41150 \\
 guillermo & 4297 & 1 & 20000 & 2 & 0 & 8003 & 41159 \\
 riccardo & 4297 & 1 & 20000 & 2 & 0 & 5000 & 41161 \\
 volkert & 1

### Prepare validation datasets

In [None]:
# Load the datasets listed in `open_cc_dids` together with their metadata table.
open_cc_datasets, open_cc_datasets_df = load_openml_list(
    open_cc_dids,
    multiclass=True,
    shuffled=True,
    filter_for_nan=False,
    max_samples=2000,
    num_feats=100,
    return_capped=True,
)

def extend_datasets(datasets, filtering = False):
    """Return the usable dataset entries, each extended with its OpenML qualities.

    Parameters
    ----------
    datasets : dict
        Maps a dataset id to a dict of metadata for that dataset.
    filtering : bool, default False
        When true, entries with fewer than 150 or more than 2000 instances,
        more than 100 features, or more than 10 classes are dropped.

    Returns
    -------
    dict
        Maps each kept dataset id to a new dict holding the original metadata
        plus the qualities reported by OpenML. The input dicts are not modified.

    Notes
    -----
    Entries that lack 'NumberOfFeatures', 'NumberOfClasses' or
    'NumberOfInstances', or that report no classes, are printed and skipped.
    """
    required_keys = ('NumberOfFeatures', 'NumberOfClasses', 'NumberOfInstances')
    extended_datasets = {}
    for d in tqdm(datasets):
        meta = datasets[d]
        if (any(key not in meta for key in required_keys)
                # or meta['NumberOfFeatures'] >= num_feats
                or meta['NumberOfClasses'] <= 0):
            print(meta)
            continue
        # Apply the cheap size filter first so that no OpenML request is made
        # for datasets that are going to be discarded anyway.
        if filtering and (meta['NumberOfInstances'] < 150
                          or meta['NumberOfInstances'] > 2000
                          or meta['NumberOfFeatures'] > 100
                          or meta['NumberOfClasses'] > 10):
            continue
        ds = openml.datasets.get_dataset(d, download_data=False)
        # Build a fresh dict instead of updating the caller's entry in place.
        # `or {}` guards against `qualities` being None (presumably possible
        # when OpenML has no qualities for a dataset; verify against the API).
        extended_datasets[d] = {**meta, **(ds.qualities or {})}

    return extended_datasets

# All datasets
openml_list = openml.datasets.list_datasets()
openml_list = pd.DataFrame.from_dict(openml_list, orient="index")

# Select only classification (rows that report a majority class size)
openml_list = openml_list[~openml_list['MajorityClassSize'].isna()]

# Remove duplicated datasets: first by identical summary statistics, then by identical name
duplicated = openml_list.duplicated(subset=['MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize',
                                            'NumberOfClasses', 'NumberOfFeatures', 'NumberOfInstances',
                                            'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',
                                            'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures'], keep='first')
openml_list = openml_list[~duplicated]

duplicated = openml_list.duplicated(subset=['name'], keep='first')
openml_list = openml_list[~duplicated]

# Filter out datasets that don't have meta information or don't fulfill other criteria
openml_list = openml_list.to_dict(orient='index')
openml_list = pd.DataFrame.from_dict(extend_datasets(openml_list, filtering=True), orient="index")


def _cfi_key(row):
    """Return the 'classes_features_instances' fingerprint string of one metadata row."""
    return str(row.NumberOfClasses) + '_' + str(row.NumberOfFeatures) + '_' + str(row.NumberOfInstances)


# Filter out datasets in Open CC, first by exact name and then by an identical
# (classes, features, instances) fingerprint. The Open CC metadata is the
# `open_cc_datasets_df` frame loaded earlier in this cell; it is only read here.
openml_list = openml_list[~openml_list['name'].isin(open_cc_datasets_df['name'].values)]
openml_list['CFI'] = openml_list.apply(_cfi_key, axis=1)
# NOTE(review): the fingerprints are compared as strings, so both frames need the
# same numeric dtypes ('2' vs '2.0' would not match) - confirm.
open_cc_cfi = open_cc_datasets_df.apply(_cfi_key, axis=1)
openml_list = openml_list[~openml_list['CFI'].isin(open_cc_cfi.values)]

# Remove time series and artificial data
for fragment in ('autoUniv', 'fri_', 'FOREX'):
    openml_list = openml_list[~openml_list['name'].str.contains(fragment, regex=False)]

# Remove datasets that overlapped with Open CC closely by name
for fragment in ('ilpd', 'car', 'pc1'):
    openml_list = openml_list[~openml_list['name'].str.contains(fragment, regex=False)]

# Remove datasets that didn't load
failed_dids = [1065, 40589, 41496, 770, 43097, 43148, 43255, 43595, 43786, 41701]
openml_list = openml_list[~openml_list['did'].isin(failed_dids)]

# Remove class skew: the minority class must be more than 5% of the majority class size
openml_list = openml_list[(openml_list.MinorityClassSize / openml_list.MajorityClassSize) > 0.05]
# Drop datasets whose AutoCorrelation quality is exactly 1
# (presumably targets stored in sorted order - TODO confirm).
openml_list = openml_list[openml_list.AutoCorrelation != 1]

# Remove too easy
openml_list = openml_list[openml_list.CfsSubsetEval_DecisionStumpAUC != 1]

In [None]:
# Metadata columns that are cast to int before the table is printed.
count_cols = [
    'NumberOfFeatures',
    'NumberOfSymbolicFeatures',
    'NumberOfInstances',
    'NumberOfClasses',
    'NumberOfMissingValues',
    'MinorityClassSize',
]

# Work on a copy so `openml_list` itself is left untouched.
print_table = openml_list[['name'] + count_cols].copy()
# Expose the frame's index as a regular column, since the index itself is not printed.
print_table['id'] = print_table.index
print_table[count_cols] = print_table[count_cols].astype(int)
print_table = print_table.rename(columns=renamer)
print(print_table.to_latex(index=False))