"""Baseline classifiers (AutoGluon, auto-sklearn, CatBoost, XGBoost, KNN, GP, logistic
regression) evaluated with a shared interface:

    metric, pred, extra = <name>_metric(x, y, test_x, test_y, cat_features, metric_used, max_time)

Hyperparameters are tuned with random search (hyperopt) under a wall-clock budget.
"""
from catboost import CatBoostClassifier, Pool
import math
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn import neighbors
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
import numpy as np
from scripts import tabular_metrics
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval, rand
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import autosklearn.classification

CV = 5  # Number of cross-validation folds used during hyperparameter search
MULTITHREAD = 1  # Number of threads baselines are able to use at most

# Hyperparameter search spaces, keyed by baseline name.
param_grid, param_grid_hyperopt = {}, {}


def get_scoring_direction(metric_used):
    """Return the optimization sign for `metric_used` (-1 = maximize, 1 = minimize). Not needed."""
    if metric_used == tabular_metrics.auc_metric:
        return -1
    elif metric_used == tabular_metrics.cross_entropy:
        return 1
    else:
        raise Exception('No scoring string found for metric')


def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
    """Translate a metric callable from `scripts.tabular_metrics` into the scoring
    identifier the consuming framework expects.

    :param metric_used: metric callable (`tabular_metrics.auc_metric` or `.cross_entropy`)
    :param multiclass: whether the task has more than two classes
    :param usage: consuming framework: 'sklearn_cv', 'autogluon', 'autosklearn',
        'catboost' or 'xgb'
    :raises Exception: if no scoring string is known for `metric_used`
    """
    if metric_used == tabular_metrics.auc_metric:
        if usage == 'sklearn_cv':
            return 'roc_auc_ovo'
        elif usage == 'autogluon':
            # Autogluon crashes when using 'roc_auc' with some datasets (robert, fabert);
            # using logloss gives better scores. We might be able to fix this, but it
            # doesn't work out of the box. File bug report?
            return 'log_loss'
        elif usage == 'autosklearn':
            if multiclass:
                return autosklearn.metrics.log_loss  # roc_auc only works for binary, use logloss instead
            else:
                return autosklearn.metrics.roc_auc
        elif usage == 'catboost':
            return 'MultiClass'  # Effectively LogLoss, ROC not available
        elif usage == 'xgb':
            return 'logloss'
        return 'roc_auc'
    elif metric_used == tabular_metrics.cross_entropy:
        if usage == 'sklearn_cv':
            return 'neg_log_loss'
        elif usage == 'autogluon':
            return 'log_loss'
        elif usage == 'autosklearn':
            return autosklearn.metrics.log_loss
        elif usage == 'catboost':
            return 'MultiClass'  # Effectively LogLoss
        return 'logloss'
    else:
        raise Exception('No scoring string found for metric')


def eval_f(params, clf_, x, y, metric_used, start_time, max_time):
    """Hyperopt objective: negated mean CV score of `clf_(**params)` on (x, y).

    Returns NaN once the `max_time` budget (seconds since `start_time`) is exhausted,
    so remaining trials are effectively no-ops.
    """
    if time.time() - start_time > max_time:
        return np.nan
    scores = cross_val_score(clf_(**params), x, y, cv=CV, scoring=get_scoring_string(metric_used))
    return -np.nanmean(scores)


def preprocess_impute(x, y, test_x, test_y, impute, one_hot, standardize, cat_features=None):
    """Move torch tensors to numpy and optionally impute, one-hot encode and min-max scale.

    All transforms are fit on the training split only and applied to both splits.

    :param impute: mean-impute NaNs in numeric features
    :param one_hot: one-hot encode the columns listed in `cat_features`
    :param standardize: min-max scale all features to [0, 1]
    :param cat_features: column indices of categorical features (default: none)
    :return: (x, y, test_x, test_y) as numpy arrays
    """
    # NOTE: was a mutable default argument (cat_features=[]); use a None sentinel instead.
    cat_features = [] if cat_features is None else cat_features

    import warnings

    def warn(*args, **kwargs):
        pass
    warnings.warn = warn  # silence sklearn warnings during repeated fits

    x, y, test_x, test_y = x.cpu().numpy(), y.cpu().long().numpy(), test_x.cpu().numpy(), test_y.cpu().long().numpy()

    if impute:
        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp_mean.fit(x)
        x, test_x = imp_mean.transform(x), imp_mean.transform(test_x)

    if one_hot:
        def make_pd_from_np(x):
            data = pd.DataFrame(x)
            for c in cat_features:
                data.iloc[:, c] = data.iloc[:, c].astype('int')
            return data
        x, test_x = make_pd_from_np(x), make_pd_from_np(test_x)
        transformer = ColumnTransformer(
            transformers=[('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_features)],
            remainder="passthrough")
        transformer.fit(x)
        x, test_x = transformer.transform(x), transformer.transform(test_x)

    if standardize:
        scaler = MinMaxScaler()
        scaler.fit(x)
        x, test_x = scaler.transform(x), scaler.transform(test_x)

    return x, y, test_x, test_y


## Auto Gluon
def autogluon_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    """Fit an AutoGluon TabularPredictor under a `max_time` budget and score it with `metric_used`."""
    from autogluon.tabular import TabularPredictor  # Inside function so package can be used without installation

    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
                                             one_hot=False,
                                             cat_features=cat_features,
                                             impute=False,
                                             standardize=False)
    # Label is appended as the last column of the frame.
    train_data = pd.DataFrame(np.concatenate([x, y[:, np.newaxis]], 1))
    test_data = pd.DataFrame(np.concatenate([test_x, test_y[:, np.newaxis]], 1))

    # AutoGluon automatically infers datatypes, we don't specify the categorical labels
    predictor = TabularPredictor(
        label=train_data.columns[-1],
        eval_metric=get_scoring_string(metric_used, usage='autogluon', multiclass=(len(np.unique(y)) > 2)),
        problem_type='multiclass' if len(np.unique(y)) > 2 else 'binary'
        # seed=int(y[:].sum()) doesn't accept seed
    ).fit(
        train_data=train_data,
        time_limit=max_time,
        presets=['best_quality']
        # The seed is deterministic but varies for each dataset and each split of it
    )

    pred = predictor.predict_proba(test_data, as_multiclass=True).values

    metric = metric_used(test_y, pred)

    return metric, pred, predictor.fit_summary()


## AUTO Sklearn
def autosklearn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    """auto-sklearn v1 baseline; thin wrapper around `autosklearn2_metric` with version=1."""
    return autosklearn2_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=max_time, version=1)

from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.classification import AutoSklearnClassifier

def autosklearn2_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300, version=2):
    """Fit auto-sklearn (v2 by default, v1 when version=1) under a `max_time` budget."""
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
                                             one_hot=False,
                                             cat_features=cat_features,
                                             impute=False,
                                             standardize=False)

    def make_pd_from_np(x):
        # auto-sklearn reads pandas 'category' dtype to detect categorical columns.
        data = pd.DataFrame(x)
        for c in cat_features:
            data.iloc[:, c] = data.iloc[:, c].astype('category')
        return data

    x = make_pd_from_np(x)
    test_x = make_pd_from_np(test_x)

    clf_ = AutoSklearn2Classifier if version == 2 else AutoSklearnClassifier
    clf = clf_(time_left_for_this_task=max_time,
               memory_limit=4000,
               n_jobs=MULTITHREAD,
               seed=int(y[:].sum()),  # The seed is deterministic but varies for each dataset and each split of it
               metric=get_scoring_string(metric_used, usage='autosklearn', multiclass=len(np.unique(y)) > 2))

    # fit model to data
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, None


param_grid_hyperopt['logistic'] = {
    'penalty': hp.choice('penalty', ['l1', 'l2', 'none']),
    # BUG FIX: hp.randint takes integer bounds (label, low, high), not a list;
    # the previous hp.randint('max_iter', [50, 500]) crashed at sampling time.
    'max_iter': hp.randint('max_iter', 50, 500),
    'fit_intercept': hp.choice('fit_intercept', [True, False]),
    'C': hp.loguniform('C', -5, math.log(5.0)),
}  # 'normalize': [False],


def logistic_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    """Logistic regression baseline with random-search hyperparameter tuning."""
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
                                             one_hot=True, impute=True, standardize=True,
                                             cat_features=cat_features)

    def clf_(**params):
        return LogisticRegression(solver='saga', tol=1e-4, n_jobs=1, **params)

    start_time = time.time()

    def stop(trial):
        return time.time() - start_time > max_time, []

    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['logistic'],
        algo=rand.suggest,
        rstate=np.random.RandomState(int(y[:].sum())),
        early_stop_fn=stop,
        # The seed is deterministic but varies for each dataset and each split of it
        max_evals=10000)
    best = space_eval(param_grid_hyperopt['logistic'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best


## KNN
param_grid_hyperopt['knn'] = {'n_neighbors': hp.randint('n_neighbors', 1, 16)}


def knn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    """K-nearest-neighbors baseline with random-search hyperparameter tuning."""
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
                                             one_hot=True, impute=True, standardize=True,
                                             cat_features=cat_features)

    def clf_(**params):
        return neighbors.KNeighborsClassifier(n_jobs=1, **params)

    start_time = time.time()

    def stop(trial):
        return time.time() - start_time > max_time, []

    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['knn'],
        algo=rand.suggest,
        rstate=np.random.RandomState(int(y[:].sum())),
        early_stop_fn=stop,
        # The seed is deterministic but varies for each dataset and each split of it
        max_evals=10000)
    best = space_eval(param_grid_hyperopt['knn'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best


## GP
param_grid_hyperopt['gp'] = {
    'params_y_scale': hp.loguniform('params_y_scale', math.log(0.05), math.log(5.0)),
    'params_length_scale': hp.loguniform('params_length_scale', math.log(0.1), math.log(1.0)),
    'n_jobs': hp.choice('njobs', [1])
}


def gp_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    """Gaussian-process classifier baseline (RBF kernel) with random-search tuning."""
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
                                             one_hot=True, impute=True, standardize=True,
                                             cat_features=cat_features)

    def clf_(params_y_scale, params_length_scale, **params):
        return GaussianProcessClassifier(kernel=params_y_scale * RBF(params_length_scale), **params)

    start_time = time.time()

    def stop(trial):
        return time.time() - start_time > max_time, []

    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['gp'],
        algo=rand.suggest,
        rstate=np.random.RandomState(int(y[:].sum())),
        early_stop_fn=stop,
        # The seed is deterministic but varies for each dataset and each split of it
        max_evals=1000)
    best = space_eval(param_grid_hyperopt['gp'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best


# Catboost
# Hyperparameter space: https://arxiv.org/pdf/2106.03253.pdf
param_grid_hyperopt['catboost'] = {
    'learning_rate': hp.loguniform('learning_rate', math.log(math.pow(math.e, -5)), math.log(1)),
    'random_strength': hp.randint('random_strength', 1, 20),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', math.log(1), math.log(10)),
    'bagging_temperature': hp.uniform('bagging_temperature', 0., 1),
    'leaf_estimation_iterations': hp.randint('leaf_estimation_iterations', 1, 20),
    'iterations': hp.randint('iterations', 100, 4000),  # This is smaller than in paper, 4000 leads to ram overusage
}


def catboost_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    """CatBoost baseline with random-search hyperparameter tuning.

    CatBoost consumes the raw (non-one-hot) features and handles categoricals natively.
    """
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
                                             one_hot=False,
                                             cat_features=cat_features,
                                             impute=False,
                                             standardize=False)

    # Nans in categorical features must be encoded as separate class.
    # BUG FIX: np.nan_to_num's second positional argument is `copy`, not the fill
    # value — the old call np.nan_to_num(..., -1) silently filled NaNs with 0.
    x[:, cat_features], test_x[:, cat_features] = (np.nan_to_num(x[:, cat_features], nan=-1),
                                                   np.nan_to_num(test_x[:, cat_features], nan=-1))

    def make_pd_from_np(x):
        data = pd.DataFrame(x)
        for c in cat_features:
            data.iloc[:, c] = data.iloc[:, c].astype('int')
        return data

    x = make_pd_from_np(x)
    test_x = make_pd_from_np(test_x)

    def clf_(**params):
        return CatBoostClassifier(
            loss_function=get_scoring_string(metric_used, usage='catboost'),
            thread_count=MULTITHREAD,
            used_ram_limit='4gb',
            random_seed=int(y[:].sum()),
            logging_level='Silent',
            cat_features=cat_features,
            **params)

    start_time = time.time()

    def stop(trial):
        return time.time() - start_time > max_time, []

    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['catboost'],
        algo=rand.suggest,
        rstate=np.random.RandomState(int(y[:].sum())),
        early_stop_fn=stop,
        # The seed is deterministic but varies for each dataset and each split of it
        max_evals=1000)
    best = space_eval(param_grid_hyperopt['catboost'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best


# XGBoost
# Hyperparameter space: https://arxiv.org/pdf/2106.03253.pdf
param_grid_hyperopt['xgb'] = {
    'learning_rate': hp.loguniform('learning_rate', -7, math.log(1)),
    'max_depth': hp.randint('max_depth', 1, 10),
    'subsample': hp.uniform('subsample', 0.2, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.2, 1),
    'min_child_weight': hp.loguniform('min_child_weight', -16, 5),
    'alpha': hp.loguniform('alpha', -16, 2),
    'lambda': hp.loguniform('lambda', -16, 2),
    'gamma': hp.loguniform('gamma', -16, 2),
    'n_estimators': hp.randint('n_estimators', 100, 4000),  # This is smaller than in paper
}


def xgb_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    """XGBoost baseline with random-search hyperparameter tuning."""
    # XGB Documentation:
    # XGB handles categorical data appropriately without using One Hot Encoding,
    # categorical features are experimental
    # XGB handles missing values appropriately without imputation
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
                                             one_hot=False,
                                             cat_features=cat_features,
                                             impute=False,
                                             standardize=False)

    def clf_(**params):
        return xgb.XGBClassifier(use_label_encoder=False,
                                 nthread=1,
                                 **params,
                                 eval_metric=get_scoring_string(metric_used, usage='xgb')  # AUC not implemented
                                 )

    start_time = time.time()

    def stop(trial):
        return time.time() - start_time > max_time, []

    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['xgb'],
        algo=rand.suggest,
        rstate=np.random.RandomState(int(y[:].sum())),
        early_stop_fn=stop,
        # The seed is deterministic but varies for each dataset and each split of it
        max_evals=1000)
    best = space_eval(param_grid_hyperopt['xgb'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best


# Dispatch table: baseline name -> evaluation function with the shared signature.
clf_dict = {'gp': gp_metric,
            'knn': knn_metric,
            'catboost': catboost_metric,
            'xgb': xgb_metric,
            'logistic': logistic_metric,
            'autosklearn': autosklearn_metric,
            'autosklearn2': autosklearn2_metric,
            'autogluon': autogluon_metric}