{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "import openml\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from tqdm import tqdm\n", "\n", "from datasets import load_openml_list, test_dids_classification, valid_large_classification, open_cc_dids, open_cc_valid_dids\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ], "source": [ "%load_ext autoreload\n", "\n", "%autoreload 2" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### Prepare test datasets" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "renamer = {'name': 'Name', 'NumberOfFeatures': '# Features', 'NumberOfSymbolicFeatures': '# Categorical Features', 'NumberOfInstances': '# Instances', 'NumberOfMissingValues': '# NaNs', 'NumberOfClasses': '# Classes', 'MinorityClassSize': 'Minority Class Size'}\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OrderedDict([(99,\n", " {'id': 99,\n", " 'alias': 'OpenML-CC18',\n", " 'main_entity_type': 'task',\n", " 'name': 'OpenML-CC18 Curated Classification benchmark',\n", " 'status': 'active',\n", " 'creation_date': '2019-02-21 18:47:13',\n", " 'creator': 1}),\n", " (225,\n", " {'id': 225,\n", " 'alias': 'OpenML-friendly',\n", " 'main_entity_type': 'task',\n", " 'name': 'OpenML100-friendly',\n", " 'status': 'active',\n", " 'creation_date': '2019-09-16 19:41:46',\n", " 'creator': 1})])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "openml.study.list_suites()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "suite = 
openml.study.get_suite(suite_id=99)\n", "tasks = openml.tasks.list_tasks(output_format=\"dataframe\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Using ``@`` in `pd.DataFrame.query <\n", "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_\n", "# accesses variables outside of the current dataframe.\n", "tasks = tasks.query(\"tid in @suite.tasks\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "tids = list(tasks[np.logical_and(np.logical_and((tasks.NumberOfInstances <= 2000), (tasks.NumberOfFeatures <= 100))\n", " , (tasks.NumberOfClasses <= 10))].tid)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "30" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(tids)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "tids = list(tasks[tasks.NumberOfInstances <= 2000].tid)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "open_cc_dids = [openml.tasks.get_task(task_id).get_dataset().id for task_id in tids]" ] }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "open_ml_datasets, open_ml_datasets_df = load_openml_list(test_dids_classification, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 100000, num_feats=100, return_capped=True)\n" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "open_ml_datasets_df = open_ml_datasets_df[open_ml_datasets_df.NumberOfInstances > 10000]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\\begin{tabular}{lrrrrrrr}\n", "\\toprule\n", " Name & \\# Features & \\# 
Categorical Features & \\# Instances & \\# Classes & \\# NaNs & Minority Class Size & id \\\\\n", "\\midrule\n", " KDDCup09\\_appetency & 231 & 39 & 50000 & 2 & 8024152 & 890 & 1111 \\\\\n", " airlines & 8 & 5 & 539383 & 2 & 0 & 240264 & 1169 \\\\\n", " bank-marketing & 17 & 10 & 45211 & 2 & 0 & 5289 & 1461 \\\\\n", " nomao & 119 & 30 & 34465 & 2 & 0 & 9844 & 1486 \\\\\n", " adult & 15 & 9 & 48842 & 2 & 6465 & 11687 & 1590 \\\\\n", " covertype & 55 & 45 & 581012 & 7 & 0 & 2747 & 1596 \\\\\n", " numerai28.6 & 22 & 1 & 96320 & 2 & 0 & 47662 & 23517 \\\\\n", " connect-4 & 43 & 43 & 67557 & 3 & 0 & 6449 & 40668 \\\\\n", "jungle\\_chess\\_2pcs\\_raw\\_endgame\\_complete & 7 & 1 & 44819 & 3 & 0 & 4335 & 41027 \\\\\n", " APSFailure & 171 & 1 & 76000 & 2 & 1078695 & 1375 & 41138 \\\\\n", " albert & 79 & 53 & 425240 & 2 & 2734000 & 212620 & 41147 \\\\\n", " MiniBooNE & 51 & 1 & 130064 & 2 & 0 & 36499 & 41150 \\\\\n", " guillermo & 4297 & 1 & 20000 & 2 & 0 & 8003 & 41159 \\\\\n", " riccardo & 4297 & 1 & 20000 & 2 & 0 & 5000 & 41161 \\\\\n", " volkert & 181 & 1 & 58310 & 10 & 0 & 1361 & 41166 \\\\\n", " dionis & 61 & 1 & 416188 & 355 & 0 & 878 & 41167 \\\\\n", " jannis & 55 & 1 & 83733 & 4 & 0 & 1687 & 41168 \\\\\n", " helena & 28 & 1 & 65196 & 100 & 0 & 111 & 41169 \\\\\n", "\\bottomrule\n", "\\end{tabular}\n", "\n" ] } ], "source": [ "print_table = open_ml_datasets_df\n", "print_table = print_table[['name', 'NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].copy()\n", "print_table['id'] = print_table.index\n", "print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']] = print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].astype(int)\n", "print_table = print_table.rename(columns=renamer)\n", 
"print(print_table.to_latex(index=False))" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### Prepare Validation datasets" ] }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "open_cc_datasets, open_cc_datasets_df = load_openml_list(open_cc_dids, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 2000, num_feats=100, return_capped=True)\n", "\n", "def extend_datasets(datasets, filtering = False):\n", " extended_datasets = {}\n", " i = 0\n", " for d in tqdm(datasets):\n", " if ((not 'NumberOfFeatures' in datasets[d])\n", " or (not 'NumberOfClasses' in datasets[d])\n", " or (not 'NumberOfInstances' in datasets[d])\n", " # or datasets[d]['NumberOfFeatures'] >= num_feats\n", " or datasets[d]['NumberOfClasses'] <= 0):\n", " print(datasets[d])\n", " continue\n", " ds = openml.datasets.get_dataset(d, download_data=False)\n", " if filtering and (datasets[d]['NumberOfInstances'] < 150\n", " or datasets[d]['NumberOfInstances'] > 2000\n", " or datasets[d]['NumberOfFeatures'] > 100\n", " or datasets[d]['NumberOfClasses'] > 10):\n", " continue\n", " extended_datasets[d] = datasets[d]\n", " extended_datasets[d].update(ds.qualities)\n", " \n", " return extended_datasets\n", "\n", "# All datasets\n", "openml_list = openml.datasets.list_datasets()\n", "openml_list = pd.DataFrame.from_dict(openml_list, orient=\"index\")\n", "\n", "# Select only classification\n", "openml_list = openml_list[~openml_list['MajorityClassSize'].isna()]\n", "\n", "# Remove duplicated datasets\n", "duplicated = openml_list.duplicated(subset=['MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize',\n", " 'NumberOfClasses', 'NumberOfFeatures', 'NumberOfInstances',\n", " 'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',\n", " 'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures'], keep='first')\n", "openml_list = openml_list[~duplicated]\n", "\n", "duplicated = openml_list.duplicated(subset=['name'], 
keep='first')\n", "openml_list = openml_list[~duplicated]\n", "\n", "# Filter out datasets that don't have meta information or don't fulfill other criteria\n", "openml_list = openml_list.to_dict(orient='index')\n", "openml_list = pd.DataFrame.from_dict(extend_datasets(openml_list, filtering=True), orient=\"index\")\n", "\n", "# Filter out datasets in Open CC\n", "openml_list = openml_list[~openml_list.name.apply(lambda x: x in test_datasets_multiclass_df.name.values)]\n", "openml_list['CFI'] = openml_list.apply(lambda x: str(x.NumberOfClasses) + '_' + str(x.NumberOfFeatures) + '_' + str(x.NumberOfInstances), axis = 1)\n", "test_datasets_multiclass_df['CFI'] = test_datasets_multiclass_df.apply(lambda x: str(x.NumberOfClasses) + '_' + str(x.NumberOfFeatures) + '_' + str(x.NumberOfInstances), axis = 1)\n", "openml_list = openml_list[~openml_list.CFI.apply(lambda x: x in test_datasets_multiclass_df.CFI.values)]\n", "\n", "# Remove time series and artificial data\n", "openml_list = openml_list[~openml_list.name.apply(lambda x: 'autoUniv' in x)]\n", "openml_list = openml_list[~openml_list.name.apply(lambda x: 'fri_' in x)]\n", "openml_list = openml_list[~openml_list.name.apply(lambda x: 'FOREX' in x)]\n", "\n", "# Remove datasets that overlapped with Open CC closely by name\n", "openml_list = openml_list[~openml_list.name.apply(lambda x: 'ilpd' in x)]\n", "openml_list = openml_list[~openml_list.name.apply(lambda x: 'car' in x)]\n", "openml_list = openml_list[~openml_list.name.apply(lambda x: 'pc1' in x)]\n", "\n", "# Remove datasets that didn't load\n", "openml_list = openml_list[~openml_list.did.apply(lambda x: x in {1065, 40589, 41496, 770, 43097, 43148, 43255, 43595, 43786, 41701})]\n", "\n", "# Remove class skew\n", "openml_list = openml_list[(openml_list.MinorityClassSize / openml_list.MajorityClassSize) > 0.05]\n", "openml_list = openml_list[openml_list.AutoCorrelation != 1]\n", "\n", "# Remove too easy\n", "openml_list = 
openml_list[openml_list.CfsSubsetEval_DecisionStumpAUC != 1]" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print_table = openml_list\n", "print_table = print_table[['name', 'NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].copy()\n", "print_table['id'] = print_table.index\n", "print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']] = print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].astype(int)\n", "print_table = print_table.rename(columns=renamer)\n", "print(print_table.to_latex(index=False))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.13" } }, "nbformat": 4, "nbformat_minor": 4 }