Spaces:
Running
Running
MilesCranmer
committed on
Commit
•
9351408
1
Parent(s):
c41cf33
Change "best" model_selection to apply loss threshold
Browse files- pysr/sr.py +40 -23
pysr/sr.py
CHANGED
@@ -205,10 +205,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
205 |
Parameters
|
206 |
----------
|
207 |
model_selection : str, default="best"
|
208 |
-
Model selection criterion. Can be 'accuracy' or 'best'.
|
209 |
-
`"accuracy"` selects the candidate model with the lowest loss
|
210 |
-
|
211 |
-
the
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
|
213 |
binary_operators : list[str], default=["+", "-", "*", "/"]
|
214 |
List of strings giving the binary operators in Julia's Base.
|
@@ -469,7 +475,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
469 |
Whether to use a progress bar instead of printing to stdout.
|
470 |
|
471 |
equation_file : str, default=None
|
472 |
-
Where to save the files (
|
473 |
|
474 |
temp_equation_file : bool, default=False
|
475 |
Whether to put the hall of fame file in the temp directory.
|
@@ -943,12 +949,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
943 |
|
944 |
for i, equations in enumerate(all_equations):
|
945 |
selected = ["" for _ in range(len(equations))]
|
946 |
-
|
947 |
-
chosen_row = -1
|
948 |
-
elif self.model_selection == "best":
|
949 |
-
chosen_row = equations["score"].idxmax()
|
950 |
-
else:
|
951 |
-
raise NotImplementedError
|
952 |
selected[chosen_row] = ">>>>"
|
953 |
repr_equations = pd.DataFrame(
|
954 |
dict(
|
@@ -1091,18 +1092,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1091 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1092 |
return self.equations_.iloc[index]
|
1093 |
|
1094 |
-
if self.
|
1095 |
-
|
1096 |
-
|
1097 |
-
|
1098 |
-
|
1099 |
-
|
1100 |
-
|
1101 |
-
|
1102 |
-
else:
|
1103 |
-
raise NotImplementedError(
|
1104 |
-
f"{self.model_selection} is not a valid model selection strategy."
|
1105 |
-
)
|
1106 |
|
1107 |
def _setup_equation_file(self):
|
1108 |
"""
|
@@ -2149,6 +2146,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2149 |
return ret_outputs[0]
|
2150 |
|
2151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2152 |
def _denoise(X, y, Xresampled=None, random_state=None):
|
2153 |
"""Denoise the dataset using a Gaussian process"""
|
2154 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
|
|
205 |
Parameters
|
206 |
----------
|
207 |
model_selection : str, default="best"
|
208 |
+
Model selection criterion. Can be 'accuracy', 'best', or 'score'.
|
209 |
+
- `"accuracy"` selects the candidate model with the lowest loss
|
210 |
+
(highest accuracy).
|
211 |
+
- `"score"` selects the candidate model with the highest score.
|
212 |
+
Score is defined as the derivative of the log-loss with
|
213 |
+
respect to complexity - if an expression has a much better
|
214 |
+
loss at a slightly higher complexity, it is preferred.
|
215 |
+
- `"best"` selects the candidate model with the highest score
|
216 |
+
among expressions with a loss better than at least 1.5x the
|
217 |
+
most accurate model.
|
218 |
|
219 |
binary_operators : list[str], default=["+", "-", "*", "/"]
|
220 |
List of strings giving the binary operators in Julia's Base.
|
|
|
475 |
Whether to use a progress bar instead of printing to stdout.
|
476 |
|
477 |
equation_file : str, default=None
|
478 |
+
Where to save the files (.csv extension).
|
479 |
|
480 |
temp_equation_file : bool, default=False
|
481 |
Whether to put the hall of fame file in the temp directory.
|
|
|
949 |
|
950 |
for i, equations in enumerate(all_equations):
|
951 |
selected = ["" for _ in range(len(equations))]
|
952 |
+
chosen_row = idx_model_selection(equations, self.model_selection)
|
|
|
|
|
|
|
|
|
|
|
953 |
selected[chosen_row] = ">>>>"
|
954 |
repr_equations = pd.DataFrame(
|
955 |
dict(
|
|
|
1092 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1093 |
return self.equations_.iloc[index]
|
1094 |
|
1095 |
+
if isinstance(self.equations_, list):
|
1096 |
+
return [
|
1097 |
+
eq.iloc[idx_model_selection(eq, self.model_selection)]
|
1098 |
+
for eq in self.equations_
|
1099 |
+
]
|
1100 |
+
return self.equations_.iloc[
|
1101 |
+
idx_model_selection(self.equations_, self.model_selection)
|
1102 |
+
]
|
|
|
|
|
|
|
|
|
1103 |
|
1104 |
def _setup_equation_file(self):
|
1105 |
"""
|
|
|
2146 |
return ret_outputs[0]
|
2147 |
|
2148 |
|
2149 |
+
def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
|
2150 |
+
"""
|
2151 |
+
Return the index of the selected expression, given a dataframe of
|
2152 |
+
equations and a model selection.
|
2153 |
+
"""
|
2154 |
+
if model_selection == "accuracy":
|
2155 |
+
chosen_idx = equations["loss"].idxmin()
|
2156 |
+
elif model_selection == "best":
|
2157 |
+
threshold = 1.5 * equations["loss"].min()
|
2158 |
+
filtered_equations = equations.query(f"loss < {threshold}")
|
2159 |
+
chosen_idx = filtered_equations["score"].idxmax()
|
2160 |
+
elif model_selection == "score":
|
2161 |
+
chosen_idx = equations["score"].idxmax()
|
2162 |
+
else:
|
2163 |
+
raise NotImplementedError(
|
2164 |
+
f"{model_selection} is not a valid model selection strategy."
|
2165 |
+
)
|
2166 |
+
return chosen_idx
|
2167 |
+
|
2168 |
+
|
2169 |
def _denoise(X, y, Xresampled=None, random_state=None):
|
2170 |
"""Denoise the dataset using a Gaussian process"""
|
2171 |
from sklearn.gaussian_process import GaussianProcessRegressor
|