|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
from sklearn.ensemble import RandomForestClassifier |
|
|
from sklearn.metrics import roc_auc_score |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def feature_shuffle_rf(X_train, y_train, max_depth=None, class_weight=None,
                       top_n=15, n_estimators=50, random_state=0):
    """Rank features by the ROC-AUC drop caused by shuffling each one.

    A ``RandomForestClassifier`` is fitted on ``(X_train, y_train)`` and its
    ROC-AUC is measured on that same training data.  Each column of
    ``X_train`` is then permuted in turn (all other columns left intact),
    the already-fitted model is re-scored, and the difference
    ``baseline_auc - shuffled_auc`` is recorded for that column.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; one importance value is produced per column.
    y_train : pandas.Series
        Target labels.  Must support ``reset_index``.  Scoring uses
        ``predict_proba(...)[:, 1]``, i.e. the probability of the second
        class, so a binary target is expected.
    max_depth, class_weight, n_estimators, random_state
        Forwarded unchanged to ``RandomForestClassifier``.  ``random_state``
        is also the seed for the per-column shuffle, so every column is
        permuted with the same row order.
    top_n : int, default 15
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    auc_drop : pandas.DataFrame
        Columns ``'feature'`` and ``'auc_drop'``, sorted by ``'auc_drop'``
        in descending order.
    selected_features : pandas.Series
        The ``'feature'`` values whose ``'auc_drop'`` is strictly positive.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        class_weight=class_weight,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])

    # Loop-invariant setup: one positional-index copy of the inputs is enough.
    # The working frame is a private copy, so the caller's data is never
    # modified.
    X_work = X_train.copy().reset_index(drop=True)
    y_work = y_train.copy().reset_index(drop=True)

    feature_dict = {}
    for feature in X_train.columns:
        # Keep the pristine column so it can be put back after scoring;
        # this avoids deep-copying the whole frame for every feature.
        original_column = X_work[feature].copy()

        # Permute the rows of this column only.  Resetting the index makes
        # the assignment positional instead of re-aligning on the old labels
        # (which would silently undo the shuffle).
        X_work[feature] = original_column.sample(
            frac=1, random_state=random_state
        ).reset_index(drop=True)

        shuff_auc = roc_auc_score(y_work, model.predict_proba(X_work)[:, 1])
        feature_dict[feature] = train_auc - shuff_auc

        # Restore the column before moving on to the next feature.
        X_work[feature] = original_column

    auc_drop = pd.Series(feature_dict).reset_index()
    auc_drop.columns = ['feature', 'auc_drop']
    auc_drop = auc_drop.sort_values(by=['auc_drop'], ascending=False)

    # Only features whose shuffling actually hurt the score are selected.
    selected_features = auc_drop.loc[auc_drop['auc_drop'] > 0, 'feature']

    return auc_drop, selected_features
|
|
|
|
|
|