Spaces:
Running
Running
MilesCranmer
committed on
Commit
•
c822df8
1
Parent(s):
12e6d5e
Move feature selection functionality to separate file
Browse files
- pysr/feature_selection.py +35 -0
- pysr/sr.py +1 -33
- pysr/test/test.py +3 -8
pysr/feature_selection.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Functions for doing feature selection during preprocessing."""
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
|
5 |
+
def run_feature_selection(X, y, select_k_features, random_state=None) -> np.ndarray:
|
6 |
+
"""
|
7 |
+
Find most important features.
|
8 |
+
|
9 |
+
Uses a random forest regressor as a proxy for finding
|
10 |
+
the k most important features in X, returning indices for those
|
11 |
+
features as output.
|
12 |
+
"""
|
13 |
+
from sklearn.ensemble import RandomForestRegressor
|
14 |
+
from sklearn.feature_selection import SelectFromModel
|
15 |
+
|
16 |
+
clf = RandomForestRegressor(
|
17 |
+
n_estimators=100, max_depth=3, random_state=random_state
|
18 |
+
)
|
19 |
+
clf.fit(X, y)
|
20 |
+
selector = SelectFromModel(
|
21 |
+
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
22 |
+
)
|
23 |
+
return selector.get_support(indices=True)
|
24 |
+
|
25 |
+
|
26 |
+
# Function has not been removed only due to usage in module tests
|
27 |
+
def _handle_feature_selection(X, select_k_features, y, variable_names):
|
28 |
+
if select_k_features is not None:
|
29 |
+
selection = run_feature_selection(X, y, select_k_features)
|
30 |
+
print(f"Using features {[variable_names[i] for i in selection]}")
|
31 |
+
X = X[:, selection]
|
32 |
+
else:
|
33 |
+
selection = None
|
34 |
+
|
35 |
+
return X, selection
|
pysr/sr.py
CHANGED
@@ -25,6 +25,7 @@ from .export_latex import sympy2latex, sympy2latextable, sympy2multilatextable
|
|
25 |
from .export_numpy import sympy2numpy
|
26 |
from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
|
27 |
from .export_torch import sympy2torch
|
|
|
28 |
from .julia_helpers import (
|
29 |
_escape_filename,
|
30 |
_load_backend,
|
@@ -2385,36 +2386,3 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
|
|
2385 |
f"{model_selection} is not a valid model selection strategy."
|
2386 |
)
|
2387 |
return chosen_idx
|
2388 |
-
|
2389 |
-
|
2390 |
-
# Function has not been removed only due to usage in module tests
|
2391 |
-
def _handle_feature_selection(X, select_k_features, y, variable_names):
|
2392 |
-
if select_k_features is not None:
|
2393 |
-
selection = run_feature_selection(X, y, select_k_features)
|
2394 |
-
print(f"Using features {[variable_names[i] for i in selection]}")
|
2395 |
-
X = X[:, selection]
|
2396 |
-
|
2397 |
-
else:
|
2398 |
-
selection = None
|
2399 |
-
return X, selection
|
2400 |
-
|
2401 |
-
|
2402 |
-
def run_feature_selection(X, y, select_k_features, random_state=None):
|
2403 |
-
"""
|
2404 |
-
Find most important features.
|
2405 |
-
|
2406 |
-
Uses a gradient boosting tree regressor as a proxy for finding
|
2407 |
-
the k most important features in X, returning indices for those
|
2408 |
-
features as output.
|
2409 |
-
"""
|
2410 |
-
from sklearn.ensemble import RandomForestRegressor
|
2411 |
-
from sklearn.feature_selection import SelectFromModel
|
2412 |
-
|
2413 |
-
clf = RandomForestRegressor(
|
2414 |
-
n_estimators=100, max_depth=3, random_state=random_state
|
2415 |
-
)
|
2416 |
-
clf.fit(X, y)
|
2417 |
-
selector = SelectFromModel(
|
2418 |
-
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
2419 |
-
)
|
2420 |
-
return selector.get_support(indices=True)
|
|
|
25 |
from .export_numpy import sympy2numpy
|
26 |
from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
|
27 |
from .export_torch import sympy2torch
|
28 |
+
from .feature_selection import run_feature_selection
|
29 |
from .julia_helpers import (
|
30 |
_escape_filename,
|
31 |
_load_backend,
|
|
|
2386 |
f"{model_selection} is not a valid model selection strategy."
|
2387 |
)
|
2388 |
return chosen_idx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pysr/test/test.py
CHANGED
@@ -14,14 +14,9 @@ from sklearn.utils.estimator_checks import check_estimator
|
|
14 |
|
15 |
from .. import PySRRegressor, julia_helpers
|
16 |
from ..export_latex import sympy2latex
|
17 |
-
from ..sr import (
|
18 |
-
_check_assertions,
|
19 |
-
_csv_filename_to_pkl_filename,
|
20 |
-
_handle_feature_selection,
|
21 |
-
_process_constraints,
|
22 |
-
idx_model_selection,
|
23 |
-
run_feature_selection,
|
24 |
-
)
|
25 |
|
26 |
DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
|
27 |
DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
|
|
|
14 |
|
15 |
from .. import PySRRegressor, julia_helpers
|
16 |
from ..export_latex import sympy2latex
|
17 |
+
from ..feature_selection import _handle_feature_selection, run_feature_selection
|
18 |
+
from ..sr import _check_assertions, _process_constraints, idx_model_selection
|
19 |
+
from ..utils import _csv_filename_to_pkl_filename
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
|
22 |
DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
|