MilesCranmer committed
Merge pull request #428 from MilesCranmer/refactor-utils
- .github/workflows/CI.yml +26 -0
- mypy.ini +8 -0
- pysr/__init__.py +2 -1
- pysr/denoising.py +35 -0
- pysr/deprecated.py +54 -0
- pysr/export_latex.py +13 -11
- pysr/export_sympy.py +2 -2
- pysr/feature_selection.py +35 -0
- pysr/feynman_problems.py +1 -1
- pysr/sr.py +19 -159
- pysr/test/test.py +3 -8
- pysr/utils.py +55 -0
.github/workflows/CI.yml
CHANGED
@@ -143,3 +143,29 @@ jobs:
       run: |
         pip install coveralls
         coveralls --finish
+
+  types:
+    name: Check types
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+    strategy:
+      matrix:
+        python-version: ['3.10']
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: "Set up Python"
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+      - name: "Install PySR and all dependencies"
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install mypy jax jaxlib torch
+          python setup.py install
+      - name: "Run mypy"
+        run: mypy --install-types --non-interactive pysr

mypy.ini
ADDED
@@ -0,0 +1,8 @@
+[mypy]
+warn_return_any = True
+
+[mypy-sklearn.*]
+ignore_missing_imports = True
+
+[mypy-julia.*]
+ignore_missing_imports = True

pysr/__init__.py
CHANGED
@@ -1,9 +1,10 @@
 from . import sklearn_monkeypatch
+from .deprecated import best, best_callable, best_row, best_tex, pysr
 from .export_jax import sympy2jax
 from .export_torch import sympy2torch
 from .feynman_problems import FeynmanProblem, Problem
 from .julia_helpers import install
-from .sr import PySRRegressor
+from .sr import PySRRegressor
 from .version import __version__
 
 __all__ = [

pysr/denoising.py
ADDED
@@ -0,0 +1,35 @@
+"""Functions for denoising data during preprocessing."""
+import numpy as np
+
+
+def denoise(X, y, Xresampled=None, random_state=None):
+    """Denoise the dataset using a Gaussian process."""
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
+
+    gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
+    gpr = GaussianProcessRegressor(
+        kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state
+    )
+    gpr.fit(X, y)
+
+    if Xresampled is not None:
+        return Xresampled, gpr.predict(Xresampled)
+
+    return X, gpr.predict(X)
+
+
+def multi_denoise(X, y, Xresampled=None, random_state=None):
+    """Perform `denoise` along each column of `y` independently."""
+    y = np.stack(
+        [
+            denoise(X, y[:, i], Xresampled=Xresampled, random_state=random_state)[1]
+            for i in range(y.shape[1])
+        ],
+        axis=1,
+    )
+
+    if Xresampled is not None:
+        return Xresampled, y
+
+    return X, y

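Usage sketch (not part of the diff): a minimal example of calling the new helpers directly, assuming PySR is installed and importable as `pysr.denoising`; the toy data below is made up for illustration.

import numpy as np

from pysr.denoising import denoise, multi_denoise

rng = np.random.RandomState(0)
X = rng.uniform(size=(50, 2))                    # toy inputs
y = np.cos(X[:, 0]) + 0.1 * rng.normal(size=50)  # noisy single-output target

# Fit a GP to (X, y) and replace y with the GP's smoothed prediction.
X_out, y_denoised = denoise(X, y, random_state=0)

# For multi-output targets, multi_denoise applies denoise to each column of y.
Y = np.stack([y, -y], axis=1)
X_out, Y_denoised = multi_denoise(X, Y, random_state=0)
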
pysr/deprecated.py
CHANGED
@@ -1,4 +1,58 @@
 """Various functions to deprecate features."""
+import warnings
+
+
+def pysr(X, y, weights=None, **kwargs):  # pragma: no cover
+    from .sr import PySRRegressor
+
+    warnings.warn(
+        "Calling `pysr` is deprecated. "
+        "Please use `model = PySRRegressor(**params); "
+        "model.fit(X, y)` going forward.",
+        FutureWarning,
+    )
+    model = PySRRegressor(**kwargs)
+    model.fit(X, y, weights=weights)
+    return model.equations_
+
+
+def best(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best` has been deprecated. "
+        "Please use the `PySRRegressor` interface. "
+        "After fitting, you can return `.sympy()` "
+        "to get the sympy representation "
+        "of the best equation."
+    )
+
+
+def best_row(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best_row` has been deprecated. "
+        "Please use the `PySRRegressor` interface. "
+        "After fitting, you can run `print(model)` to view the best equation, "
+        "or "
+        "`model.get_best()` to return the best equation's "
+        "row in `model.equations_`."
+    )
+
+
+def best_tex(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best_tex` has been deprecated. "
+        "Please use the `PySRRegressor` interface. "
+        "After fitting, you can return `.latex()` to "
+        "get the sympy representation "
+        "of the best equation."
+    )
+
+
+def best_callable(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best_callable` has been deprecated. Please use the `PySRRegressor` "
+        "interface. After fitting, you can use "
+        "`.predict(X)` to use the best callable."
+    )
 
 
 def make_deprecated_kwargs_for_pysr_regressor():

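Migration sketch (not part of the diff): the replacement calls that the deprecation messages above point to, with placeholder data and hyperparameters.

import numpy as np

from pysr import PySRRegressor

X = np.random.randn(100, 2)
y = X[:, 0] ** 2 + np.cos(X[:, 1])

# Instead of the old `equations = pysr(X, y, **params)`:
model = PySRRegressor(niterations=5, binary_operators=["+", "*"])
model.fit(X, y)

print(model)               # replaces best_row()
expr = model.sympy()       # replaces best()
tex = model.latex()        # replaces best_tex()
y_pred = model.predict(X)  # replaces best_callable()
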
pysr/export_latex.py
CHANGED
@@ -1,5 +1,5 @@
 """Functions to help export PySR equations to LaTeX."""
-from typing import List
+from typing import List, Optional, Tuple
 
 import pandas as pd
 import sympy
@@ -19,14 +19,16 @@ class PreciseLatexPrinter(LatexPrinter):
         return super()._print_Float(reduced_float)
 
 
-def sympy2latex(expr, prec=3, full_prec=True, **settings):
+def sympy2latex(expr, prec=3, full_prec=True, **settings) -> str:
     """Convert sympy expression to LaTeX with custom precision."""
     settings["full_prec"] = full_prec
     printer = PreciseLatexPrinter(settings=settings, prec=prec)
     return printer.doprint(expr)
 
 
-def generate_table_environment(columns=["equation", "complexity", "loss"]):
+def generate_table_environment(
+    columns: List[str] = ["equation", "complexity", "loss"]
+) -> Tuple[str, str]:
     margins = "c" * len(columns)
     column_map = {
         "complexity": "Complexity",
@@ -58,12 +60,12 @@ def generate_table_environment(columns=["equation", "complexity", "loss"]):
 
 def sympy2latextable(
     equations: pd.DataFrame,
-    indices: List[int] = None,
+    indices: Optional[List[int]] = None,
     precision: int = 3,
-    columns=["equation", "complexity", "loss", "score"],
+    columns: List[str] = ["equation", "complexity", "loss", "score"],
     max_equation_length: int = 50,
     output_variable_name: str = "y",
-):
+) -> str:
     """Generate a booktabs-style LaTeX table for a single set of equations."""
     assert isinstance(equations, pd.DataFrame)
 
@@ -71,7 +73,7 @@ def sympy2latextable(
     latex_table_content = []
 
     if indices is None:
-        indices =
+        indices = list(equations.index)
 
     for i in indices:
         latex_equation = sympy2latex(
@@ -126,11 +128,11 @@ def sympy2latextable(
 
 def sympy2multilatextable(
     equations: List[pd.DataFrame],
-    indices: List[List[int]] = None,
+    indices: Optional[List[List[int]]] = None,
     precision: int = 3,
-    columns=["equation", "complexity", "loss", "score"],
-    output_variable_names: str = None,
-):
+    columns: List[str] = ["equation", "complexity", "loss", "score"],
+    output_variable_names: Optional[List[str]] = None,
+) -> str:
     """Generate multiple latex tables for a list of equation sets."""
     # TODO: Let user specify custom output variable
 

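Usage sketch (not part of the diff): the re-typed helpers in isolation; the commented call assumes a fitted PySRRegressor whose Pareto front lives in `model.equations_`.

import sympy

from pysr.export_latex import sympy2latex, sympy2latextable

# Render a single expression with 3 significant figures.
x0 = sympy.Symbol("x0")
print(sympy2latex(3.14159265 * sympy.cos(x0), prec=3))

# Render a whole equation DataFrame as a booktabs table; with the new default,
# indices=None now means "every row of the DataFrame" (list(equations.index)).
# table = sympy2latextable(model.equations_, precision=3)
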
pysr/export_sympy.py
CHANGED
@@ -51,14 +51,14 @@ sympy_mappings = {
 
 
 def create_sympy_symbols(
-    feature_names_in:
+    feature_names_in: List[str],
 ) -> List[sympy.Symbol]:
     return [sympy.Symbol(variable) for variable in feature_names_in]
 
 
 def pysr2sympy(
     equation: str, *, extra_sympy_mappings: Optional[Dict[str, Callable]] = None
-)
+):
     local_sympy_mappings = {
         **(extra_sympy_mappings if extra_sympy_mappings else {}),
         **sympy_mappings,

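Usage sketch (not part of the diff): how the annotated helpers are typically called; the equation string is made up, and the exact parsing behaviour is assumed from the surrounding code.

from pysr.export_sympy import create_sympy_symbols, pysr2sympy

# Feature names become sympy Symbols.
symbols = create_sympy_symbols(["x0", "x1"])

# A PySR equation string is parsed against the built-in operator mappings;
# extra_sympy_mappings can add user-defined operators.
expr = pysr2sympy("cos(x0) + 1.5 * x1", extra_sympy_mappings=None)
print(expr)
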
pysr/feature_selection.py
ADDED
@@ -0,0 +1,35 @@
+"""Functions for doing feature selection during preprocessing."""
+import numpy as np
+
+
+def run_feature_selection(X, y, select_k_features, random_state=None):
+    """
+    Find most important features.
+
+    Uses a gradient boosting tree regressor as a proxy for finding
+    the k most important features in X, returning indices for those
+    features as output.
+    """
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.feature_selection import SelectFromModel
+
+    clf = RandomForestRegressor(
+        n_estimators=100, max_depth=3, random_state=random_state
+    )
+    clf.fit(X, y)
+    selector = SelectFromModel(
+        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
+    )
+    return selector.get_support(indices=True)
+
+
+# Function has not been removed only due to usage in module tests
+def _handle_feature_selection(X, select_k_features, y, variable_names):
+    if select_k_features is not None:
+        selection = run_feature_selection(X, y, select_k_features)
+        print(f"Using features {[variable_names[i] for i in selection]}")
+        X = X[:, selection]
+    else:
+        selection = None
+
+    return X, selection

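Usage sketch (not part of the diff): the moved selector on toy data where only two of five features matter; the printed result is what one would typically expect, not a guaranteed output.

import numpy as np

from pysr.feature_selection import run_feature_selection

rng = np.random.RandomState(0)
X = rng.uniform(size=(300, 5))
y = 2.0 * X[:, 1] + np.cos(X[:, 3])

# Returns the column indices of the k most important features,
# ranked by random forest feature importances.
selected = run_feature_selection(X, y, select_k_features=2, random_state=0)
print(selected)  # e.g. [1 3]
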
pysr/feynman_problems.py
CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 
 import numpy as np
 
-from .sr import best, pysr
+from .deprecated import best, pysr
 
 PKG_DIR = Path(__file__).parents[1]
 FEYNMAN_DATASET = PKG_DIR / "datasets" / "FeynmanEquations.csv"

pysr/sr.py
CHANGED
@@ -11,6 +11,7 @@ from datetime import datetime
 from io import StringIO
 from multiprocessing import cpu_count
 from pathlib import Path
+from typing import List, Optional
 
 import numpy as np
 import pandas as pd
@@ -18,12 +19,14 @@ from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.utils import check_array, check_consistent_length, check_random_state
 from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
 
+from .denoising import denoise, multi_denoise
 from .deprecated import make_deprecated_kwargs_for_pysr_regressor
 from .export_jax import sympy2jax
 from .export_latex import sympy2latex, sympy2latextable, sympy2multilatextable
 from .export_numpy import sympy2numpy
 from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
 from .export_torch import sympy2torch
+from .feature_selection import run_feature_selection
 from .julia_helpers import (
     _escape_filename,
     _load_backend,
@@ -33,23 +36,18 @@ from .julia_helpers import (
     init_julia,
     is_julia_version_greater_eq,
 )
+from .utils import (
+    _csv_filename_to_pkl_filename,
+    _preprocess_julia_floats,
+    _safe_check_feature_names_in,
+    _subscriptify,
+)
 
 Main = None  # TODO: Rename to more descriptive name like "julia_runtime"
 
 already_ran = False
 
 
-def pysr(X, y, weights=None, **kwargs):  # pragma: no cover
-    warnings.warn(
-        "Calling `pysr` is deprecated. "
-        "Please use `model = PySRRegressor(**params); model.fit(X, y)` going forward.",
-        FutureWarning,
-    )
-    model = PySRRegressor(**kwargs)
-    model.fit(X, y, weights=weights)
-    return model.equations_
-
-
 def _process_constraints(binary_operators, unary_operators, constraints):
     constraints = constraints.copy()
     for op in unary_operators:
@@ -172,37 +170,6 @@ def _check_assertions(
     )
 
 
-def best(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best` has been deprecated. Please use the `PySRRegressor` interface. "
-        "After fitting, you can return `.sympy()` to get the sympy representation "
-        "of the best equation."
-    )
-
-
-def best_row(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best_row` has been deprecated. Please use the `PySRRegressor` interface. "
-        "After fitting, you can run `print(model)` to view the best equation, or "
-        "`model.get_best()` to return the best equation's row in `model.equations_`."
-    )
-
-
-def best_tex(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best_tex` has been deprecated. Please use the `PySRRegressor` interface. "
-        "After fitting, you can return `.latex()` to get the sympy representation "
-        "of the best equation."
-    )
-
-
-def best_callable(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best_callable` has been deprecated. Please use the `PySRRegressor` "
-        "interface. After fitting, you can use `.predict(X)` to use the best callable."
-    )
-
-
 # Class validation constants
 VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
 
@@ -945,10 +912,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         model : PySRRegressor
             The model with fitted equations.
         """
-
-
-        else:
-            pkl_filename = equation_file
+
+        pkl_filename = _csv_filename_to_pkl_filename(equation_file)
 
         # Try to load model from <equation_file>.pkl
         print(f"Checking if {pkl_filename} exists...")
@@ -1502,19 +1467,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         # Denoising transformation
         if self.denoise:
             if self.nout_ > 1:
-                y = np.stack(
-                    [
-                    _denoise(
-                        X, y[:, i], Xresampled=Xresampled, random_state=random_state
-                    )[1]
-                    for i in range(self.nout_)
-                ],
-                axis=1,
+                X, y = multi_denoise(
+                    X, y, Xresampled=Xresampled, random_state=random_state
                 )
-            if Xresampled is not None:
-                X = Xresampled
             else:
-                X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
+                X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)
 
         return X, y, variable_names, X_units, y_units
 
@@ -1783,10 +1740,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         y,
         Xresampled=None,
         weights=None,
-        variable_names=None,
-        X_units=None,
-        y_units=None,
-    ):
+        variable_names: Optional[List[str]] = None,
+        X_units: Optional[List[str]] = None,
+        y_units: Optional[List[str]] = None,
+    ) -> "PySRRegressor":
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
 
@@ -2373,7 +2330,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return "\n".join(preamble_string + [table_string])
 
 
-def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
+def idx_model_selection(equations: pd.DataFrame, model_selection: str):
     """Select an expression and return its index."""
     if model_selection == "accuracy":
         chosen_idx = equations["loss"].idxmin()
@@ -2388,100 +2345,3 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
             f"{model_selection} is not a valid model selection strategy."
         )
     return chosen_idx
-
-
-def _denoise(X, y, Xresampled=None, random_state=None):
-    """Denoise the dataset using a Gaussian process."""
-    from sklearn.gaussian_process import GaussianProcessRegressor
-    from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
-
-    gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
-    gpr = GaussianProcessRegressor(
-        kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state
-    )
-    gpr.fit(X, y)
-    if Xresampled is not None:
-        return Xresampled, gpr.predict(Xresampled)
-
-    return X, gpr.predict(X)
-
-
-# Function has not been removed only due to usage in module tests
-def _handle_feature_selection(X, select_k_features, y, variable_names):
-    if select_k_features is not None:
-        selection = run_feature_selection(X, y, select_k_features)
-        print(f"Using features {[variable_names[i] for i in selection]}")
-        X = X[:, selection]
-
-    else:
-        selection = None
-    return X, selection
-
-
-def run_feature_selection(X, y, select_k_features, random_state=None):
-    """
-    Find most important features.
-
-    Uses a gradient boosting tree regressor as a proxy for finding
-    the k most important features in X, returning indices for those
-    features as output.
-    """
-    from sklearn.ensemble import RandomForestRegressor
-    from sklearn.feature_selection import SelectFromModel
-
-    clf = RandomForestRegressor(
-        n_estimators=100, max_depth=3, random_state=random_state
-    )
-    clf.fit(X, y)
-    selector = SelectFromModel(
-        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
-    )
-    return selector.get_support(indices=True)
-
-
-def _csv_filename_to_pkl_filename(csv_filename) -> str:
-    # Assume that the csv filename is of the form "foo.csv"
-    assert str(csv_filename).endswith(".csv")
-
-    dirname = str(os.path.dirname(csv_filename))
-    basename = str(os.path.basename(csv_filename))
-    base = str(os.path.splitext(basename)[0])
-
-    pkl_basename = base + ".pkl"
-
-    return os.path.join(dirname, pkl_basename)
-
-
-_regexp_im = re.compile(r"\b(\d+\.\d+)im\b")
-_regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b")
-_regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b")
-
-_apply_regexp_im = lambda x: _regexp_im.sub(r"\1j", x)
-_apply_regexp_im_sci = lambda x: _regexp_im_sci.sub(r"\1e\2j", x)
-_apply_regexp_sci = lambda x: _regexp_sci.sub(r"\1e\2", x)
-
-
-def _preprocess_julia_floats(s: str) -> str:
-    if isinstance(s, str):
-        s = _apply_regexp_im(s)
-        s = _apply_regexp_im_sci(s)
-        s = _apply_regexp_sci(s)
-    return s
-
-
-def _subscriptify(i: int) -> str:
-    """Converts integer to subscript text form.
-
-    For example, 123 -> "₁₂₃".
-    """
-    return "".join([chr(0x2080 + int(c)) for c in str(i)])
-
-
-def _safe_check_feature_names_in(self, variable_names, generate_names=True):
-    """_check_feature_names_in with compat for old versions."""
-    try:
-        return _check_feature_names_in(
-            self, variable_names, generate_names=generate_names
-        )
-    except TypeError:
-        return _check_feature_names_in(self, variable_names)

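Usage sketch (not part of the diff): `idx_model_selection` on a stand-in for `model.equations_`; only the "accuracy" branch shown in the hunk above is exercised, and PySR is assumed to be installed.

import pandas as pd

from pysr.sr import idx_model_selection

equations = pd.DataFrame(
    {
        "complexity": [1, 3, 5],
        "loss": [1.0, 0.1, 0.09],
        "score": [0.0, 1.15, 0.05],
    }
)

# "accuracy" picks the row with the lowest loss.
print(idx_model_selection(equations, "accuracy"))  # -> 2
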
pysr/test/test.py
CHANGED
@@ -14,14 +14,9 @@ from sklearn.utils.estimator_checks import check_estimator
 
 from .. import PySRRegressor, julia_helpers
 from ..export_latex import sympy2latex
-from ..sr import (
-    _check_assertions,
-    _csv_filename_to_pkl_filename,
-    _handle_feature_selection,
-    _process_constraints,
-    idx_model_selection,
-    run_feature_selection,
-)
+from ..feature_selection import _handle_feature_selection, run_feature_selection
+from ..sr import _check_assertions, _process_constraints, idx_model_selection
+from ..utils import _csv_filename_to_pkl_filename
 
 DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
 DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default

pysr/utils.py
ADDED
@@ -0,0 +1,55 @@
+import os
+import re
+
+from sklearn.utils.validation import _check_feature_names_in
+
+
+def _csv_filename_to_pkl_filename(csv_filename: str) -> str:
+    if os.path.splitext(csv_filename)[1] == ".pkl":
+        return csv_filename
+
+    # Assume that the csv filename is of the form "foo.csv"
+    assert str(csv_filename).endswith(".csv")
+
+    dirname = str(os.path.dirname(csv_filename))
+    basename = str(os.path.basename(csv_filename))
+    base = str(os.path.splitext(basename)[0])
+
+    pkl_basename = base + ".pkl"
+
+    return os.path.join(dirname, pkl_basename)
+
+
+_regexp_im = re.compile(r"\b(\d+\.\d+)im\b")
+_regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b")
+_regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b")
+
+_apply_regexp_im = lambda x: _regexp_im.sub(r"\1j", x)
+_apply_regexp_im_sci = lambda x: _regexp_im_sci.sub(r"\1e\2j", x)
+_apply_regexp_sci = lambda x: _regexp_sci.sub(r"\1e\2", x)
+
+
+def _preprocess_julia_floats(s: str) -> str:
+    if isinstance(s, str):
+        s = _apply_regexp_im(s)
+        s = _apply_regexp_im_sci(s)
+        s = _apply_regexp_sci(s)
+    return s
+
+
+def _safe_check_feature_names_in(self, variable_names, generate_names=True):
+    """_check_feature_names_in with compat for old versions."""
+    try:
+        return _check_feature_names_in(
+            self, variable_names, generate_names=generate_names
+        )
+    except TypeError:
+        return _check_feature_names_in(self, variable_names)
+
+
+def _subscriptify(i: int) -> str:
+    """Converts integer to subscript text form.
+
+    For example, 123 -> "₁₂₃".
+    """
+    return "".join([chr(0x2080 + int(c)) for c in str(i)])
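
Usage sketch (not part of the diff): the relocated utilities in isolation; the file path and float strings are made up, and the path output assumes POSIX separators.

from pysr.utils import (
    _csv_filename_to_pkl_filename,
    _preprocess_julia_floats,
    _subscriptify,
)

# Julia float/imaginary literals are rewritten into Python syntax by the regexes above.
print(_preprocess_julia_floats("1.5f-3 + 2.0im"))  # -> "1.5e-3 + 2.0j"

# Integers become unicode subscripts, e.g. for pretty-printing variable names.
print(_subscriptify(123))  # -> "₁₂₃"

# Equation-file paths are mapped to the pickle checkpoint next to them.
print(_csv_filename_to_pkl_filename("outputs/hall_of_fame.csv"))  # -> "outputs/hall_of_fame.pkl"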