Spaces:
Running
Running
"""Functions for doing feature selection during preprocessing.""" | |
from typing import Optional, cast | |
import numpy as np | |
from numpy import ndarray | |
from numpy.typing import NDArray | |
from .utils import ArrayLike | |
def run_feature_selection(
    X: ndarray,
    y: ndarray,
    select_k_features: int,
    random_state: Optional[np.random.RandomState] = None,
) -> NDArray[np.bool_]:
    """
    Pick the most important features of X.

    Fits a shallow random forest regressor on (X, y) as a proxy model,
    then asks scikit-learn's `SelectFromModel` to keep at most
    `select_k_features` features based on the fitted model. The result
    is the selector's boolean support mask (not integer indices).
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import SelectFromModel

    forest = RandomForestRegressor(
        n_estimators=100,
        max_depth=3,
        random_state=random_state,
    )
    forest.fit(X, y)

    # threshold=-inf switches off the importance cutoff, so `max_features`
    # alone limits how many features are retained.
    support_mask = SelectFromModel(
        forest,
        threshold=-np.inf,
        max_features=select_k_features,
        prefit=True,
    ).get_support(indices=False)
    return cast(NDArray[np.bool_], support_mask)
# This function is retained only because the module tests still use it.
def _handle_feature_selection(
    X: ndarray,
    select_k_features: Optional[int],
    y: ndarray,
    variable_names: ArrayLike[str],
):
    """
    Optionally reduce X to its selected feature columns.

    When `select_k_features` is None, X is returned unchanged together
    with a selection of None. Otherwise `run_feature_selection` is
    called, the names of the retained features are printed, and X is
    subset to the selected columns.

    Returns a tuple `(X, selection)` where `selection` is the boolean
    mask produced by `run_feature_selection`, or None if no selection
    was requested.
    """
    if select_k_features is None:
        return X, None

    selection = run_feature_selection(X, y, select_k_features)
    # `selection` is a boolean mask, so convert it to column indices before
    # looking up names; indexing `variable_names` with the mask's True/False
    # values would report the wrong names (or fail outright).
    selected_names = [variable_names[i] for i in np.flatnonzero(selection)]
    print(f"Using features {selected_names}")
    return X[:, selection], selection