MilesCranmer committed on
Commit
f653388
·
unverified ·
2 Parent(s): 476f573 291dc85

Merge pull request #609 from MilesCranmer/cleanup

Browse files

More extensive typing stubs and associated refactoring

.gitignore CHANGED
@@ -23,3 +23,5 @@ site
23
  **/*.code-workspace
24
  **/*.tar.gz
25
  venv
 
 
 
23
  **/*.code-workspace
24
  **/*.tar.gz
25
  venv
26
+ requirements-dev.lock
27
+ requirements.lock
environment.yml CHANGED
@@ -9,4 +9,3 @@ dependencies:
9
  - scikit-learn>=1.0.0,<2.0.0
10
  - pyjuliacall>=0.9.15,<0.10.0
11
  - click>=7.0.0,<9.0.0
12
- - typing_extensions>=4.0.0,<5.0.0
 
9
  - scikit-learn>=1.0.0,<2.0.0
10
  - pyjuliacall>=0.9.15,<0.10.0
11
  - click>=7.0.0,<9.0.0
 
pyproject.toml CHANGED
@@ -35,4 +35,10 @@ dev-dependencies = [
35
  "pre-commit>=3.7.0",
36
  "ipython>=8.23.0",
37
  "ipykernel>=6.29.4",
 
 
 
 
 
 
38
  ]
 
35
  "pre-commit>=3.7.0",
36
  "ipython>=8.23.0",
37
  "ipykernel>=6.29.4",
38
+ "mypy>=1.10.0",
39
+ "jax[cpu]>=0.4.26",
40
+ "torch>=2.3.0",
41
+ "pandas-stubs>=2.2.1.240316",
42
+ "types-pytz>=2024.1.0.20240417",
43
+ "types-openpyxl>=3.1.0.20240428",
44
  ]
pysr/denoising.py CHANGED
@@ -1,9 +1,17 @@
1
  """Functions for denoising data during preprocessing."""
2
 
 
 
3
  import numpy as np
 
4
 
5
 
6
- def denoise(X, y, Xresampled=None, random_state=None):
 
 
 
 
 
7
  """Denoise the dataset using a Gaussian process."""
8
  from sklearn.gaussian_process import GaussianProcessRegressor
9
  from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
@@ -15,12 +23,17 @@ def denoise(X, y, Xresampled=None, random_state=None):
15
  gpr.fit(X, y)
16
 
17
  if Xresampled is not None:
18
- return Xresampled, gpr.predict(Xresampled)
19
 
20
- return X, gpr.predict(X)
21
 
22
 
23
- def multi_denoise(X, y, Xresampled=None, random_state=None):
 
 
 
 
 
24
  """Perform `denoise` along each column of `y` independently."""
25
  y = np.stack(
26
  [
 
1
  """Functions for denoising data during preprocessing."""
2
 
3
+ from typing import Optional, Tuple, cast
4
+
5
  import numpy as np
6
+ from numpy import ndarray
7
 
8
 
9
+ def denoise(
10
+ X: ndarray,
11
+ y: ndarray,
12
+ Xresampled: Optional[ndarray] = None,
13
+ random_state: Optional[np.random.RandomState] = None,
14
+ ) -> Tuple[ndarray, ndarray]:
15
  """Denoise the dataset using a Gaussian process."""
16
  from sklearn.gaussian_process import GaussianProcessRegressor
17
  from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
 
23
  gpr.fit(X, y)
24
 
25
  if Xresampled is not None:
26
+ return Xresampled, cast(ndarray, gpr.predict(Xresampled))
27
 
28
+ return X, cast(ndarray, gpr.predict(X))
29
 
30
 
31
+ def multi_denoise(
32
+ X: ndarray,
33
+ y: ndarray,
34
+ Xresampled: Optional[ndarray] = None,
35
+ random_state: Optional[np.random.RandomState] = None,
36
+ ):
37
  """Perform `denoise` along each column of `y` independently."""
38
  y = np.stack(
39
  [
pysr/export_latex.py CHANGED
@@ -153,3 +153,15 @@ def sympy2multilatextable(
153
  ]
154
 
155
  return "\n\n".join(latex_tables)
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  ]
154
 
155
  return "\n\n".join(latex_tables)
156
+
157
+
158
+ def with_preamble(table_string: str) -> str:
159
+ preamble_string = [
160
+ r"\usepackage{breqn}",
161
+ r"\usepackage{booktabs}",
162
+ "",
163
+ "...",
164
+ "",
165
+ table_string,
166
+ ]
167
+ return "\n".join(preamble_string)
pysr/export_numpy.py CHANGED
@@ -1,10 +1,12 @@
1
  """Code for exporting discovered expressions to numpy"""
2
 
3
  import warnings
 
4
 
5
  import numpy as np
6
  import pandas as pd
7
- from sympy import lambdify
 
8
 
9
 
10
  def sympy2numpy(eqn, sympy_symbols, *, selection=None):
@@ -14,6 +16,10 @@ def sympy2numpy(eqn, sympy_symbols, *, selection=None):
14
  class CallableEquation:
15
  """Simple wrapper for numpy lambda functions built with sympy"""
16
 
 
 
 
 
17
  def __init__(self, eqn, sympy_symbols, selection=None):
18
  self._sympy = eqn
19
  self._sympy_symbols = sympy_symbols
@@ -29,8 +35,9 @@ class CallableEquation:
29
  return self._lambda(
30
  **{k: X[k].values for k in map(str, self._sympy_symbols)}
31
  ) * np.ones(expected_shape)
 
32
  if self._selection is not None:
33
- if X.shape[1] != len(self._selection):
34
  warnings.warn(
35
  "`X` should be of shape (n_samples, len(self._selection)). "
36
  "Automatically filtering `X` to selection. "
@@ -38,6 +45,7 @@ class CallableEquation:
38
  "this may lead to incorrect predictions and other errors."
39
  )
40
  X = X[:, self._selection]
 
41
  return self._lambda(*X.T) * np.ones(expected_shape)
42
 
43
  @property
 
1
  """Code for exporting discovered expressions to numpy"""
2
 
3
  import warnings
4
+ from typing import List, Union
5
 
6
  import numpy as np
7
  import pandas as pd
8
+ from numpy.typing import NDArray
9
+ from sympy import Expr, Symbol, lambdify
10
 
11
 
12
  def sympy2numpy(eqn, sympy_symbols, *, selection=None):
 
16
  class CallableEquation:
17
  """Simple wrapper for numpy lambda functions built with sympy"""
18
 
19
+ _sympy: Expr
20
+ _sympy_symbols: List[Symbol]
21
+ _selection: Union[NDArray[np.bool_], None]
22
+
23
  def __init__(self, eqn, sympy_symbols, selection=None):
24
  self._sympy = eqn
25
  self._sympy_symbols = sympy_symbols
 
35
  return self._lambda(
36
  **{k: X[k].values for k in map(str, self._sympy_symbols)}
37
  ) * np.ones(expected_shape)
38
+
39
  if self._selection is not None:
40
+ if X.shape[1] != self._selection.sum():
41
  warnings.warn(
42
  "`X` should be of shape (n_samples, len(self._selection)). "
43
  "Automatically filtering `X` to selection. "
 
45
  "this may lead to incorrect predictions and other errors."
46
  )
47
  X = X[:, self._selection]
48
+
49
  return self._lambda(*X.T) * np.ones(expected_shape)
50
 
51
  @property
pysr/export_sympy.py CHANGED
@@ -5,6 +5,8 @@ from typing import Callable, Dict, List, Optional
5
  import sympy
6
  from sympy import sympify
7
 
 
 
8
  sympy_mappings = {
9
  "div": lambda x, y: x / y,
10
  "mult": lambda x, y: x * y,
@@ -30,8 +32,8 @@ sympy_mappings = {
30
  "acosh": lambda x: sympy.acosh(x),
31
  "acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
32
  "asinh": sympy.asinh,
33
- "atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
34
- "atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
35
  "abs": abs,
36
  "mod": sympy.Mod,
37
  "erf": sympy.erf,
@@ -60,13 +62,13 @@ sympy_mappings = {
60
 
61
 
62
  def create_sympy_symbols_map(
63
- feature_names_in: List[str],
64
  ) -> Dict[str, sympy.Symbol]:
65
  return {variable: sympy.Symbol(variable) for variable in feature_names_in}
66
 
67
 
68
  def create_sympy_symbols(
69
- feature_names_in: List[str],
70
  ) -> List[sympy.Symbol]:
71
  return [sympy.Symbol(variable) for variable in feature_names_in]
72
 
@@ -74,7 +76,7 @@ def create_sympy_symbols(
74
  def pysr2sympy(
75
  equation: str,
76
  *,
77
- feature_names_in: Optional[List[str]] = None,
78
  extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
79
  ):
80
  if feature_names_in is None:
 
5
  import sympy
6
  from sympy import sympify
7
 
8
+ from .utils import ArrayLike
9
+
10
  sympy_mappings = {
11
  "div": lambda x, y: x / y,
12
  "mult": lambda x, y: x * y,
 
32
  "acosh": lambda x: sympy.acosh(x),
33
  "acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
34
  "asinh": sympy.asinh,
35
+ "atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
36
+ "atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
37
  "abs": abs,
38
  "mod": sympy.Mod,
39
  "erf": sympy.erf,
 
62
 
63
 
64
  def create_sympy_symbols_map(
65
+ feature_names_in: ArrayLike[str],
66
  ) -> Dict[str, sympy.Symbol]:
67
  return {variable: sympy.Symbol(variable) for variable in feature_names_in}
68
 
69
 
70
  def create_sympy_symbols(
71
+ feature_names_in: ArrayLike[str],
72
  ) -> List[sympy.Symbol]:
73
  return [sympy.Symbol(variable) for variable in feature_names_in]
74
 
 
76
  def pysr2sympy(
77
  equation: str,
78
  *,
79
+ feature_names_in: Optional[ArrayLike[str]] = None,
80
  extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
81
  ):
82
  if feature_names_in is None:
pysr/feature_selection.py CHANGED
@@ -1,9 +1,20 @@
1
  """Functions for doing feature selection during preprocessing."""
2
 
 
 
3
  import numpy as np
 
 
 
 
4
 
5
 
6
- def run_feature_selection(X, y, select_k_features, random_state=None):
 
 
 
 
 
7
  """
8
  Find most important features.
9
 
@@ -21,11 +32,16 @@ def run_feature_selection(X, y, select_k_features, random_state=None):
21
  selector = SelectFromModel(
22
  clf, threshold=-np.inf, max_features=select_k_features, prefit=True
23
  )
24
- return selector.get_support(indices=True)
25
 
26
 
27
  # Function has not been removed only due to usage in module tests
28
- def _handle_feature_selection(X, select_k_features, y, variable_names):
 
 
 
 
 
29
  if select_k_features is not None:
30
  selection = run_feature_selection(X, y, select_k_features)
31
  print(f"Using features {[variable_names[i] for i in selection]}")
 
1
  """Functions for doing feature selection during preprocessing."""
2
 
3
+ from typing import Optional, cast
4
+
5
  import numpy as np
6
+ from numpy import ndarray
7
+ from numpy.typing import NDArray
8
+
9
+ from .utils import ArrayLike
10
 
11
 
12
+ def run_feature_selection(
13
+ X: ndarray,
14
+ y: ndarray,
15
+ select_k_features: int,
16
+ random_state: Optional[np.random.RandomState] = None,
17
+ ) -> NDArray[np.bool_]:
18
  """
19
  Find most important features.
20
 
 
32
  selector = SelectFromModel(
33
  clf, threshold=-np.inf, max_features=select_k_features, prefit=True
34
  )
35
+ return cast(NDArray[np.bool_], selector.get_support(indices=False))
36
 
37
 
38
  # Function has not been removed only due to usage in module tests
39
+ def _handle_feature_selection(
40
+ X: ndarray,
41
+ select_k_features: Optional[int],
42
+ y: ndarray,
43
+ variable_names: ArrayLike[str],
44
+ ):
45
  if select_k_features is not None:
46
  selection = run_feature_selection(X, y, select_k_features)
47
  print(f"Using features {[variable_names[i] for i in selection]}")
pysr/julia_helpers.py CHANGED
@@ -1,11 +1,16 @@
1
  """Functions for initializing the Julia environment and installing deps."""
2
 
 
 
3
  import numpy as np
4
  from juliacall import convert as jl_convert # type: ignore
 
5
 
6
  from .deprecated import init_julia, install
7
  from .julia_import import jl
8
 
 
 
9
  jl.seval("using Serialization: Serialization")
10
  jl.seval("using PythonCall: PythonCall")
11
 
@@ -22,24 +27,31 @@ def _escape_filename(filename):
22
  return str_repr
23
 
24
 
25
- def _load_cluster_manager(cluster_manager):
26
  jl.seval(f"using ClusterManagers: addprocs_{cluster_manager}")
27
  return jl.seval(f"addprocs_{cluster_manager}")
28
 
29
 
30
- def jl_array(x):
31
  if x is None:
32
  return None
33
- return jl_convert(jl.Array, x)
 
 
 
 
 
 
 
34
 
35
 
36
- def jl_serialize(obj):
37
  buf = jl.IOBuffer()
38
  Serialization.serialize(buf, obj)
39
  return np.array(jl.take_b(buf))
40
 
41
 
42
- def jl_deserialize(s):
43
  if s is None:
44
  return s
45
  buf = jl.IOBuffer()
 
1
  """Functions for initializing the Julia environment and installing deps."""
2
 
3
+ from typing import Any, Callable, Union, cast
4
+
5
  import numpy as np
6
  from juliacall import convert as jl_convert # type: ignore
7
+ from numpy.typing import NDArray
8
 
9
  from .deprecated import init_julia, install
10
  from .julia_import import jl
11
 
12
+ jl_convert = cast(Callable[[Any, Any], Any], jl_convert)
13
+
14
  jl.seval("using Serialization: Serialization")
15
  jl.seval("using PythonCall: PythonCall")
16
 
 
27
  return str_repr
28
 
29
 
30
+ def _load_cluster_manager(cluster_manager: str):
31
  jl.seval(f"using ClusterManagers: addprocs_{cluster_manager}")
32
  return jl.seval(f"addprocs_{cluster_manager}")
33
 
34
 
35
+ def jl_array(x, dtype=None):
36
  if x is None:
37
  return None
38
+ elif dtype is None:
39
+ return jl_convert(jl.Array, x)
40
+ else:
41
+ return jl_convert(jl.Array[dtype], x)
42
+
43
+
44
+ def jl_is_function(f) -> bool:
45
+ return cast(bool, jl.seval("op -> op isa Function")(f))
46
 
47
 
48
+ def jl_serialize(obj: Any) -> NDArray[np.uint8]:
49
  buf = jl.IOBuffer()
50
  Serialization.serialize(buf, obj)
51
  return np.array(jl.take_b(buf))
52
 
53
 
54
+ def jl_deserialize(s: Union[NDArray[np.uint8], None]):
55
  if s is None:
56
  return s
57
  buf = jl.IOBuffer()
pysr/julia_import.py CHANGED
@@ -1,6 +1,8 @@
1
  import os
2
  import sys
3
  import warnings
 
 
4
 
5
  # Check if JuliaCall is already loaded, and if so, warn the user
6
  # about the relevant environment variables. If not loaded,
@@ -42,6 +44,9 @@ if autoload_extensions is not None:
42
 
43
  from juliacall import Main as jl # type: ignore
44
 
 
 
 
45
  jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
46
 
47
  jl.seval("using SymbolicRegression")
 
1
  import os
2
  import sys
3
  import warnings
4
+ from types import ModuleType
5
+ from typing import cast
6
 
7
  # Check if JuliaCall is already loaded, and if so, warn the user
8
  # about the relevant environment variables. If not loaded,
 
44
 
45
  from juliacall import Main as jl # type: ignore
46
 
47
+ jl = cast(ModuleType, jl)
48
+
49
+
50
  jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
51
 
52
  jl.seval("using SymbolicRegression")
pysr/sr.py CHANGED
@@ -8,27 +8,31 @@ import shutil
8
  import sys
9
  import tempfile
10
  import warnings
 
11
  from datetime import datetime
12
  from io import StringIO
13
  from multiprocessing import cpu_count
14
  from pathlib import Path
15
- from typing import Callable, Dict, List, Optional, Tuple, Union
16
-
17
- if sys.version_info >= (3, 8):
18
- from typing import Literal
19
- else:
20
- from typing_extensions import Literal
21
 
22
  import numpy as np
23
  import pandas as pd
 
 
24
  from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
25
  from sklearn.utils import check_array, check_consistent_length, check_random_state
26
- from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
 
27
 
28
  from .denoising import denoise, multi_denoise
29
  from .deprecated import DEPRECATED_KWARGS
30
  from .export_jax import sympy2jax
31
- from .export_latex import sympy2latex, sympy2latextable, sympy2multilatextable
 
 
 
 
 
32
  from .export_numpy import sympy2numpy
33
  from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
34
  from .export_torch import sympy2torch
@@ -40,17 +44,20 @@ from .julia_helpers import (
40
  _load_cluster_manager,
41
  jl_array,
42
  jl_deserialize,
 
43
  jl_serialize,
44
  )
45
  from .julia_import import SymbolicRegression, jl
46
  from .utils import (
 
 
47
  _csv_filename_to_pkl_filename,
48
  _preprocess_julia_floats,
49
  _safe_check_feature_names_in,
50
  _subscriptify,
51
  )
52
 
53
- already_ran = False
54
 
55
 
56
  def _process_constraints(binary_operators, unary_operators, constraints):
@@ -178,6 +185,21 @@ def _check_assertions(
178
  VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
179
 
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
182
  """
183
  High-performance symbolic regression algorithm.
@@ -606,22 +628,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
606
  Units of each variable in the training dataset, `y`.
607
  nout_ : int
608
  Number of output dimensions.
609
- selection_mask_ : list[int] of length `select_k_features`
610
- List of indices for input features that are selected when
611
- `select_k_features` is set.
612
  tempdir_ : Path
613
  Path to the temporary equations directory.
614
- equation_file_ : str
615
  Output equation file name produced by the julia backend.
616
  julia_state_stream_ : ndarray
617
  The serialized state for the julia SymbolicRegression.jl backend (after fitting),
618
  stored as an array of uint8, produced by Julia's Serialization.serialize function.
619
- julia_state_
620
- The deserialized state.
621
  julia_options_stream_ : ndarray
622
  The serialized julia options, stored as an array of uint8,
623
- julia_options_
624
- The deserialized julia options.
625
  equation_file_contents_ : list[pandas.DataFrame]
626
  Contents of the equation file output by the Julia backend.
627
  show_pickle_warnings_ : bool
@@ -668,6 +685,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
668
  ```
669
  """
670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
671
  def __init__(
672
  self,
673
  model_selection: Literal["best", "accuracy", "score"] = "best",
@@ -900,14 +932,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
900
  @classmethod
901
  def from_file(
902
  cls,
903
- equation_file,
904
  *,
905
- binary_operators=None,
906
- unary_operators=None,
907
- n_features_in=None,
908
- feature_names_in=None,
909
- selection_mask=None,
910
- nout=1,
911
  **pysr_kwargs,
912
  ):
913
  """
@@ -915,7 +947,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
915
 
916
  Parameters
917
  ----------
918
- equation_file : str
919
  Path to a pickle file containing a saved model, or a csv file
920
  containing equations.
921
  binary_operators : list[str]
@@ -930,8 +962,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
930
  feature_names_in : list[str]
931
  Names of the features passed to the model.
932
  Not needed if loading from a pickle file.
933
- selection_mask : list[bool]
934
- If using select_k_features, you must pass `model.selection_mask_` here.
935
  Not needed if loading from a pickle file.
936
  nout : int
937
  Number of outputs of the model.
@@ -982,7 +1014,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
982
 
983
  # TODO: copy .bkup file if exists.
984
  model = cls(
985
- equation_file=equation_file,
986
  binary_operators=binary_operators,
987
  unary_operators=unary_operators,
988
  **pysr_kwargs,
@@ -1002,7 +1034,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1002
  model.display_feature_names_in_ = feature_names_in
1003
 
1004
  if selection_mask is None:
1005
- model.selection_mask_ = np.ones(n_features_in, dtype=bool)
1006
  else:
1007
  model.selection_mask_ = selection_mask
1008
 
@@ -1029,7 +1061,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1029
  all_equations = equations
1030
 
1031
  for i, equations in enumerate(all_equations):
1032
- selected = ["" for _ in range(len(equations))]
1033
  chosen_row = idx_model_selection(equations, self.model_selection)
1034
  selected[chosen_row] = ">>>>"
1035
  repr_equations = pd.DataFrame(
@@ -1129,10 +1161,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1129
 
1130
  @property
1131
  def julia_options_(self):
 
1132
  return jl_deserialize(self.julia_options_stream_)
1133
 
1134
  @property
1135
  def julia_state_(self):
 
1136
  return jl_deserialize(self.julia_state_stream_)
1137
 
1138
  @property
@@ -1145,7 +1179,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1145
  )
1146
  return self.julia_state_
1147
 
1148
- def get_best(self, index=None):
1149
  """
1150
  Get best equation using `model_selection`.
1151
 
@@ -1168,8 +1202,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1168
  Raised when an invalid model selection strategy is provided.
1169
  """
1170
  check_is_fitted(self, attributes=["equations_"])
1171
- if self.equations_ is None:
1172
- raise ValueError("No equations have been generated yet.")
1173
 
1174
  if index is not None:
1175
  if isinstance(self.equations_, list):
@@ -1177,16 +1209,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1177
  index, list
1178
  ), "With multiple output features, index must be a list."
1179
  return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
1180
- return self.equations_.iloc[index]
 
 
1181
 
1182
  if isinstance(self.equations_, list):
1183
  return [
1184
- eq.iloc[idx_model_selection(eq, self.model_selection)]
1185
  for eq in self.equations_
1186
  ]
1187
- return self.equations_.iloc[
1188
- idx_model_selection(self.equations_, self.model_selection)
1189
- ]
 
 
 
1190
 
1191
  def _setup_equation_file(self):
1192
  """
@@ -1211,7 +1248,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1211
  self.equation_file_ = self.equation_file
1212
  self.equation_file_contents_ = None
1213
 
1214
- def _validate_and_set_init_params(self):
1215
  """
1216
  Ensure parameters passed at initialization are valid.
1217
 
@@ -1269,59 +1306,48 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1269
  f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
1270
  )
1271
 
1272
- progress = self.progress
1273
- # 'Mutable' parameter validation
1274
- # (Params and their default values, if None is given:)
1275
- default_param_mapping = {
1276
- "binary_operators": "+ * - /".split(" "),
1277
- "unary_operators": [],
1278
- "maxdepth": self.maxsize,
1279
- "constraints": {},
1280
- "multithreading": self.procs != 0 and self.cluster_manager is None,
1281
- "batch_size": 1,
1282
- "update_verbosity": int(self.verbosity),
1283
- "progress": progress,
1284
- }
1285
- packed_modified_params = {}
1286
- for parameter, default_value in default_param_mapping.items():
1287
- parameter_value = getattr(self, parameter)
1288
- if parameter_value is None:
1289
- parameter_value = default_value
1290
  else:
1291
- # Special cases such as when binary_operators is a string
1292
- if parameter in ["binary_operators", "unary_operators"] and isinstance(
1293
- parameter_value, str
1294
- ):
1295
- parameter_value = [parameter_value]
1296
- elif parameter == "batch_size" and parameter_value < 1:
1297
- warnings.warn(
1298
- "Given `batch_size` must be greater than or equal to one. "
1299
- "`batch_size` has been increased to equal one."
1300
- )
1301
- parameter_value = 1
1302
- elif (
1303
- parameter == "progress"
1304
- and parameter_value
1305
- and "buffer" not in sys.stdout.__dir__()
1306
- ):
1307
- warnings.warn(
1308
- "Note: it looks like you are running in Jupyter. "
1309
- "The progress bar will be turned off."
1310
- )
1311
- parameter_value = False
1312
- packed_modified_params[parameter] = parameter_value
1313
 
1314
  assert (
1315
- len(packed_modified_params["binary_operators"])
1316
- + len(packed_modified_params["unary_operators"])
1317
- > 0
1318
- )
1319
 
1320
- return packed_modified_params
1321
 
1322
  def _validate_and_set_fit_params(
1323
  self, X, y, Xresampled, weights, variable_names, X_units, y_units
1324
- ):
 
 
 
 
 
 
 
 
1325
  """
1326
  Validate the parameters passed to the :term`fit` method.
1327
 
@@ -1341,7 +1367,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1341
  Weight array of the same shape as `y`.
1342
  Each element is how to weight the mean-square-error loss
1343
  for that particular element of y.
1344
- variable_names : list[str] of length n_features
1345
  Names of each variable in the training dataset, `X`.
1346
  X_units : list[str] of length n_features
1347
  Units of each variable in the training dataset, `X`.
@@ -1397,7 +1423,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1397
  if weights is not None:
1398
  weights = check_array(weights, ensure_2d=False)
1399
  check_consistent_length(weights, y)
1400
- X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
1401
  self.feature_names_in_ = _safe_check_feature_names_in(
1402
  self, variable_names, generate_names=False
1403
  )
@@ -1407,10 +1433,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1407
  self.display_feature_names_in_ = np.array(
1408
  [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
1409
  )
 
1410
  else:
1411
  self.display_feature_names_in_ = self.feature_names_in_
1412
-
1413
- variable_names = self.feature_names_in_
1414
 
1415
  # Handle multioutput data
1416
  if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
@@ -1425,8 +1451,23 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1425
 
1426
  return X, y, Xresampled, weights, variable_names, X_units, y_units
1427
 
 
 
 
 
 
 
 
 
1428
  def _pre_transform_training_data(
1429
- self, X, y, Xresampled, variable_names, X_units, y_units, random_state
 
 
 
 
 
 
 
1430
  ):
1431
  """
1432
  Transform the training data before fitting the symbolic regressor.
@@ -1435,12 +1476,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1435
 
1436
  Parameters
1437
  ----------
1438
- X : ndarray | pandas.DataFrame
1439
  Training data of shape (n_samples, n_features).
1440
- y : ndarray | pandas.DataFrame
1441
  Target values of shape (n_samples,) or (n_samples, n_targets).
1442
  Will be cast to X's dtype if necessary.
1443
- Xresampled : ndarray | pandas.DataFrame
1444
  Resampled training data, of shape `(n_resampled, n_features)`,
1445
  used for denoising.
1446
  variable_names : list[str]
@@ -1478,24 +1519,35 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1478
  """
1479
  # Feature selection transformation
1480
  if self.select_k_features:
1481
- self.selection_mask_ = run_feature_selection(
1482
  X, y, self.select_k_features, random_state=random_state
1483
  )
1484
- X = X[:, self.selection_mask_]
1485
 
1486
  if Xresampled is not None:
1487
- Xresampled = Xresampled[:, self.selection_mask_]
1488
 
1489
  # Reduce variable_names to selection
1490
- variable_names = [variable_names[i] for i in self.selection_mask_]
 
 
 
 
 
 
 
1491
 
1492
  if X_units is not None:
1493
- X_units = [X_units[i] for i in self.selection_mask_]
 
 
 
1494
  self.X_units_ = copy.deepcopy(X_units)
1495
 
1496
  # Re-perform data validation and feature name updating
1497
- X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
1498
  # Update feature names with selected variable names
 
1499
  self.feature_names_in_ = _check_feature_names_in(self, variable_names)
1500
  self.display_feature_names_in_ = self.feature_names_in_
1501
  print(f"Using features {self.feature_names_in_}")
@@ -1511,20 +1563,27 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1511
 
1512
  return X, y, variable_names, X_units, y_units
1513
 
1514
- def _run(self, X, y, mutated_params, weights, seed):
 
 
 
 
 
 
 
1515
  """
1516
  Run the symbolic regression fitting process on the julia backend.
1517
 
1518
  Parameters
1519
  ----------
1520
- X : ndarray | pandas.DataFrame
1521
  Training data of shape `(n_samples, n_features)`.
1522
- y : ndarray | pandas.DataFrame
1523
  Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
1524
  Will be cast to `X`'s dtype if necessary.
1525
- mutated_params : dict[str, Any]
1526
- Dictionary of mutated versions of some parameters passed in __init__.
1527
- weights : ndarray | pandas.DataFrame
1528
  Weight array of the same shape as `y`.
1529
  Each element is how to weight the mean-square-error loss
1530
  for that particular element of y.
@@ -1543,24 +1602,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1543
  """
1544
  # Need to be global as we don't want to recreate/reinstate julia for
1545
  # every new instance of PySRRegressor
1546
- global already_ran
1547
 
1548
  # These are the parameters which may be modified from the ones
1549
  # specified in init, so we define them here locally:
1550
- binary_operators = mutated_params["binary_operators"]
1551
- unary_operators = mutated_params["unary_operators"]
1552
- maxdepth = mutated_params["maxdepth"]
1553
- constraints = mutated_params["constraints"]
 
 
 
 
 
 
1554
  nested_constraints = self.nested_constraints
1555
  complexity_of_operators = self.complexity_of_operators
1556
- multithreading = mutated_params["multithreading"]
1557
  cluster_manager = self.cluster_manager
1558
- batch_size = mutated_params["batch_size"]
1559
- update_verbosity = mutated_params["update_verbosity"]
1560
- progress = mutated_params["progress"]
1561
 
1562
  # Start julia backend processes
1563
- if not already_ran and update_verbosity != 0:
1564
  print("Compiling Julia backend...")
1565
 
1566
  if cluster_manager is not None:
@@ -1599,6 +1660,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1599
  complexity_of_operators_str += f"({k}) => {v}, "
1600
  complexity_of_operators_str += ")"
1601
  complexity_of_operators = jl.seval(complexity_of_operators_str)
 
1602
 
1603
  custom_loss = jl.seval(
1604
  str(self.elementwise_loss)
@@ -1635,11 +1697,25 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1635
  optimize=self.weight_optimize,
1636
  )
1637
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1638
  # Call to Julia backend.
1639
  # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
1640
  options = SymbolicRegression.Options(
1641
- binary_operators=jl.seval(str(binary_operators).replace("'", "")),
1642
- unary_operators=jl.seval(str(unary_operators).replace("'", "")),
1643
  bin_constraints=jl_array(bin_constraints),
1644
  una_constraints=jl_array(una_constraints),
1645
  complexity_of_operators=complexity_of_operators,
@@ -1671,9 +1747,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1671
  fraction_replaced_hof=self.fraction_replaced_hof,
1672
  should_simplify=self.should_simplify,
1673
  should_optimize_constants=self.should_optimize_constants,
1674
- warmup_maxsize_by=(
1675
- 0.0 if self.warmup_maxsize_by is None else self.warmup_maxsize_by
1676
- ),
1677
  use_frequency=self.use_frequency,
1678
  use_frequency_in_tournament=self.use_frequency_in_tournament,
1679
  adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
@@ -1780,7 +1854,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1780
  if self.delete_tempfiles:
1781
  shutil.rmtree(self.tempdir_)
1782
 
1783
- already_ran = True
1784
 
1785
  return self
1786
 
@@ -1790,9 +1864,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1790
  y,
1791
  Xresampled=None,
1792
  weights=None,
1793
- variable_names: Optional[List[str]] = None,
1794
- X_units: Optional[List[str]] = None,
1795
- y_units: Optional[List[str]] = None,
1796
  ) -> "PySRRegressor":
1797
  """
1798
  Search for equations to fit the dataset and store them in `self.equations_`.
@@ -1854,12 +1928,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1854
  self.X_units_ = None
1855
  self.y_units_ = None
1856
 
1857
- random_state = check_random_state(self.random_state) # For np random
1858
- seed = random_state.get_state()[1][0] # For julia random
1859
-
1860
  self._setup_equation_file()
1861
 
1862
- mutated_params = self._validate_and_set_init_params()
1863
 
1864
  (
1865
  X,
@@ -1884,6 +1955,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1884
  "More datapoints will lower the search speed."
1885
  )
1886
 
 
 
 
1887
  # Pre transformations (feature selection and denoising)
1888
  X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
1889
  X, y, Xresampled, variable_names, X_units, y_units, random_state
@@ -1925,7 +1999,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1925
  self._checkpoint()
1926
 
1927
  # Perform the search:
1928
- self._run(X, y, mutated_params, weights=weights, seed=seed)
1929
 
1930
  # Then, after fit, we save again, so the pickle file contains
1931
  # the equations:
@@ -1934,7 +2008,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1934
 
1935
  return self
1936
 
1937
- def refresh(self, checkpoint_file=None):
1938
  """
1939
  Update self.equations_ with any new options passed.
1940
 
@@ -1943,11 +2017,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1943
 
1944
  Parameters
1945
  ----------
1946
- checkpoint_file : str
1947
  Path to checkpoint hall of fame file to be loaded.
1948
  The default will use the set `equation_file_`.
1949
  """
1950
- if checkpoint_file:
1951
  self.equation_file_ = checkpoint_file
1952
  self.equation_file_contents_ = None
1953
  check_is_fitted(self, attributes=["equation_file_"])
@@ -1999,7 +2073,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1999
  if self.selection_mask_ is not None:
2000
  # RangeIndex enforces column order allowing columns to
2001
  # be correctly filtered with self.selection_mask_
2002
- X = X.iloc[:, self.selection_mask_]
2003
  X.columns = self.feature_names_in_
2004
  # Without feature information, CallableEquation/lambda_format equations
2005
  # require that the column order of X matches that of the X used during
@@ -2009,14 +2083,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2009
  # reordered/reindexed to match those of the transformed (denoised and
2010
  # feature selected) X in fit.
2011
  X = X.reindex(columns=self.feature_names_in_)
2012
- X = self._validate_data(X, reset=False)
2013
 
2014
  try:
2015
- if self.nout_ > 1:
 
2016
  return np.stack(
2017
  [eq["lambda_format"](X) for eq in best_equation], axis=1
2018
  )
2019
- return best_equation["lambda_format"](X)
 
2020
  except Exception as error:
2021
  raise ValueError(
2022
  "Failed to evaluate the expression. "
@@ -2046,9 +2122,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2046
  """
2047
  self.refresh()
2048
  best_equation = self.get_best(index=index)
2049
- if self.nout_ > 1:
 
2050
  return [eq["sympy_format"] for eq in best_equation]
2051
- return best_equation["sympy_format"]
 
2052
 
2053
  def latex(self, index=None, precision=3):
2054
  """
@@ -2108,9 +2186,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2108
  self.set_params(output_jax_format=True)
2109
  self.refresh()
2110
  best_equation = self.get_best(index=index)
2111
- if self.nout_ > 1:
 
2112
  return [eq["jax_format"] for eq in best_equation]
2113
- return best_equation["jax_format"]
 
2114
 
2115
  def pytorch(self, index=None):
2116
  """
@@ -2138,9 +2218,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2138
  self.set_params(output_torch_format=True)
2139
  self.refresh()
2140
  best_equation = self.get_best(index=index)
2141
- if self.nout_ > 1:
2142
  return [eq["torch_format"] for eq in best_equation]
2143
- return best_equation["torch_format"]
 
2144
 
2145
  def _read_equation_file(self):
2146
  """Read the hall of fame file created by `SymbolicRegression.jl`."""
@@ -2239,10 +2320,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2239
  lastComplexity = 0
2240
  sympy_format = []
2241
  lambda_format = []
2242
- if self.output_jax_format:
2243
- jax_format = []
2244
- if self.output_torch_format:
2245
- torch_format = []
2246
 
2247
  for _, eqn_row in output.iterrows():
2248
  eqn = pysr2sympy(
@@ -2354,7 +2433,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2354
  """
2355
  self.refresh()
2356
 
2357
- if self.nout_ > 1:
2358
  if indices is not None:
2359
  assert isinstance(indices, list)
2360
  assert isinstance(indices[0], list)
@@ -2363,7 +2442,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2363
  table_string = sympy2multilatextable(
2364
  self.equations_, indices=indices, precision=precision, columns=columns
2365
  )
2366
- else:
2367
  if indices is not None:
2368
  assert isinstance(indices, list)
2369
  assert isinstance(indices[0], int)
@@ -2371,15 +2450,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2371
  table_string = sympy2latextable(
2372
  self.equations_, indices=indices, precision=precision, columns=columns
2373
  )
 
 
 
 
 
2374
 
2375
- preamble_string = [
2376
- r"\usepackage{breqn}",
2377
- r"\usepackage{booktabs}",
2378
- "",
2379
- "...",
2380
- "",
2381
- ]
2382
- return "\n".join(preamble_string + [table_string])
2383
 
2384
 
2385
  def idx_model_selection(equations: pd.DataFrame, model_selection: str):
@@ -2397,3 +2474,30 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str):
2397
  f"{model_selection} is not a valid model selection strategy."
2398
  )
2399
  return chosen_idx
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import sys
9
  import tempfile
10
  import warnings
11
+ from dataclasses import dataclass, fields
12
  from datetime import datetime
13
  from io import StringIO
14
  from multiprocessing import cpu_count
15
  from pathlib import Path
16
+ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, cast
 
 
 
 
 
17
 
18
  import numpy as np
19
  import pandas as pd
20
+ from numpy import ndarray
21
+ from numpy.typing import NDArray
22
  from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
23
  from sklearn.utils import check_array, check_consistent_length, check_random_state
24
+ from sklearn.utils.validation import _check_feature_names_in # type: ignore
25
+ from sklearn.utils.validation import check_is_fitted
26
 
27
  from .denoising import denoise, multi_denoise
28
  from .deprecated import DEPRECATED_KWARGS
29
  from .export_jax import sympy2jax
30
+ from .export_latex import (
31
+ sympy2latex,
32
+ sympy2latextable,
33
+ sympy2multilatextable,
34
+ with_preamble,
35
+ )
36
  from .export_numpy import sympy2numpy
37
  from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
38
  from .export_torch import sympy2torch
 
44
  _load_cluster_manager,
45
  jl_array,
46
  jl_deserialize,
47
+ jl_is_function,
48
  jl_serialize,
49
  )
50
  from .julia_import import SymbolicRegression, jl
51
  from .utils import (
52
+ ArrayLike,
53
+ PathLike,
54
  _csv_filename_to_pkl_filename,
55
  _preprocess_julia_floats,
56
  _safe_check_feature_names_in,
57
  _subscriptify,
58
  )
59
 
60
+ ALREADY_RAN = False
61
 
62
 
63
  def _process_constraints(binary_operators, unary_operators, constraints):
 
185
  VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
186
 
187
 
188
@dataclass
class _DynamicallySetParams:
    """Bundle of search parameters whose final values are resolved at runtime.

    The fields are plain data; the dataclass itself performs no validation
    or conversion of the values it is given.
    """

    # Operator specifications, one string per operator.
    binary_operators: List[str]
    unary_operators: List[str]
    # Depth limit for expressions.
    maxdepth: int
    # Per-operator constraint mapping.
    constraints: Dict[str, str]
    # Execution and batching switches.
    multithreading: bool
    batch_size: int
    # Output/reporting controls.
    update_verbosity: int
    progress: bool
    # Warm-up setting for the maximum size.
    warmup_maxsize_by: float
201
+
202
+
203
  class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
204
  """
205
  High-performance symbolic regression algorithm.
 
628
  Units of each variable in the training dataset, `y`.
629
  nout_ : int
630
  Number of output dimensions.
631
+ selection_mask_ : ndarray of shape (`n_features_in_`,)
632
+ Mask of which features of `X` to use when `select_k_features` is set.
 
633
  tempdir_ : Path
634
  Path to the temporary equations directory.
635
+ equation_file_ : Union[str, Path]
636
  Output equation file name produced by the julia backend.
637
  julia_state_stream_ : ndarray
638
  The serialized state for the julia SymbolicRegression.jl backend (after fitting),
639
  stored as an array of uint8, produced by Julia's Serialization.serialize function.
 
 
640
  julia_options_stream_ : ndarray
641
  The serialized julia options, stored as an array of uint8,
 
 
642
  equation_file_contents_ : list[pandas.DataFrame]
643
  Contents of the equation file output by the Julia backend.
644
  show_pickle_warnings_ : bool
 
685
  ```
686
  """
687
 
688
+ equations_: Union[pd.DataFrame, List[pd.DataFrame], None]
689
+ n_features_in_: int
690
+ feature_names_in_: ArrayLike[str]
691
+ display_feature_names_in_: ArrayLike[str]
692
+ X_units_: Union[ArrayLike[str], None]
693
+ y_units_: Union[str, ArrayLike[str], None]
694
+ nout_: int
695
+ selection_mask_: Union[NDArray[np.bool_], None]
696
+ tempdir_: Path
697
+ equation_file_: PathLike
698
+ julia_state_stream_: Union[NDArray[np.uint8], None]
699
+ julia_options_stream_: Union[NDArray[np.uint8], None]
700
+ equation_file_contents_: Union[List[pd.DataFrame], None]
701
+ show_pickle_warnings_: bool
702
+
703
  def __init__(
704
  self,
705
  model_selection: Literal["best", "accuracy", "score"] = "best",
 
932
  @classmethod
933
  def from_file(
934
  cls,
935
+ equation_file: PathLike,
936
  *,
937
+ binary_operators: Optional[List[str]] = None,
938
+ unary_operators: Optional[List[str]] = None,
939
+ n_features_in: Optional[int] = None,
940
+ feature_names_in: Optional[ArrayLike[str]] = None,
941
+ selection_mask: Optional[NDArray[np.bool_]] = None,
942
+ nout: int = 1,
943
  **pysr_kwargs,
944
  ):
945
  """
 
947
 
948
  Parameters
949
  ----------
950
+ equation_file : str or Path
951
  Path to a pickle file containing a saved model, or a csv file
952
  containing equations.
953
  binary_operators : list[str]
 
962
  feature_names_in : list[str]
963
  Names of the features passed to the model.
964
  Not needed if loading from a pickle file.
965
+ selection_mask : NDArray[np.bool_]
966
+ If using `select_k_features`, you must pass `model.selection_mask_` here.
967
  Not needed if loading from a pickle file.
968
  nout : int
969
  Number of outputs of the model.
 
1014
 
1015
  # TODO: copy .bkup file if exists.
1016
  model = cls(
1017
+ equation_file=str(equation_file),
1018
  binary_operators=binary_operators,
1019
  unary_operators=unary_operators,
1020
  **pysr_kwargs,
 
1034
  model.display_feature_names_in_ = feature_names_in
1035
 
1036
  if selection_mask is None:
1037
+ model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_)
1038
  else:
1039
  model.selection_mask_ = selection_mask
1040
 
 
1061
  all_equations = equations
1062
 
1063
  for i, equations in enumerate(all_equations):
1064
+ selected = pd.Series([""] * len(equations), index=equations.index)
1065
  chosen_row = idx_model_selection(equations, self.model_selection)
1066
  selected[chosen_row] = ">>>>"
1067
  repr_equations = pd.DataFrame(
 
1161
 
1162
@property
def julia_options_(self):
    """Return `julia_options_stream_` passed through `jl_deserialize`."""
    options_stream = self.julia_options_stream_
    return jl_deserialize(options_stream)
1166
 
1167
@property
def julia_state_(self):
    """Return `julia_state_stream_` passed through `jl_deserialize`."""
    state_stream = self.julia_state_stream_
    return jl_deserialize(state_stream)
1171
 
1172
  @property
 
1179
  )
1180
  return self.julia_state_
1181
 
1182
+ def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
1183
  """
1184
  Get best equation using `model_selection`.
1185
 
 
1202
  Raised when an invalid model selection strategy is provided.
1203
  """
1204
  check_is_fitted(self, attributes=["equations_"])
 
 
1205
 
1206
  if index is not None:
1207
  if isinstance(self.equations_, list):
 
1209
  index, list
1210
  ), "With multiple output features, index must be a list."
1211
  return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
1212
+ else:
1213
+ equations_ = cast(pd.DataFrame, self.equations_)
1214
+ return cast(pd.Series, equations_.iloc[index])
1215
 
1216
  if isinstance(self.equations_, list):
1217
  return [
1218
+ cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
1219
  for eq in self.equations_
1220
  ]
1221
+ else:
1222
+ equations_ = cast(pd.DataFrame, self.equations_)
1223
+ return cast(
1224
+ pd.Series,
1225
+ equations_.loc[idx_model_selection(equations_, self.model_selection)],
1226
+ )
1227
 
1228
  def _setup_equation_file(self):
1229
  """
 
1248
  self.equation_file_ = self.equation_file
1249
  self.equation_file_contents_ = None
1250
 
1251
+ def _validate_and_modify_params(self) -> _DynamicallySetParams:
1252
  """
1253
  Ensure parameters passed at initialization are valid.
1254
 
 
1306
  f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
1307
  )
1308
 
1309
+ param_container = _DynamicallySetParams(
1310
+ binary_operators=["+", "*", "-", "/"],
1311
+ unary_operators=[],
1312
+ maxdepth=self.maxsize,
1313
+ constraints={},
1314
+ multithreading=self.procs != 0 and self.cluster_manager is None,
1315
+ batch_size=1,
1316
+ update_verbosity=int(self.verbosity),
1317
+ progress=self.progress,
1318
+ warmup_maxsize_by=0.0,
1319
+ )
1320
+
1321
+ for param_name in map(lambda x: x.name, fields(_DynamicallySetParams)):
1322
+ user_param_value = getattr(self, param_name)
1323
+ if user_param_value is None:
1324
+ # Leave as the default in DynamicallySetParams
1325
+ ...
 
1326
  else:
1327
+ # If user has specified it, we will override the default.
1328
+ # However, there are some special cases to mutate it:
1329
+ new_param_value = _mutate_parameter(param_name, user_param_value)
1330
+ setattr(param_container, param_name, new_param_value)
1331
+ # TODO: This should just be part of the __init__ of _DynamicallySetParams
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1332
 
1333
  assert (
1334
+ len(param_container.binary_operators) > 0
1335
+ or len(param_container.unary_operators) > 0
1336
+ ), "At least one operator must be provided."
 
1337
 
1338
+ return param_container
1339
 
1340
  def _validate_and_set_fit_params(
1341
  self, X, y, Xresampled, weights, variable_names, X_units, y_units
1342
+ ) -> Tuple[
1343
+ ndarray,
1344
+ ndarray,
1345
+ Optional[ndarray],
1346
+ Optional[ndarray],
1347
+ ArrayLike[str],
1348
+ Optional[ArrayLike[str]],
1349
+ Optional[Union[str, ArrayLike[str]]],
1350
+ ]:
1351
  """
1352
  Validate the parameters passed to the :term`fit` method.
1353
 
 
1367
  Weight array of the same shape as `y`.
1368
  Each element is how to weight the mean-square-error loss
1369
  for that particular element of y.
1370
+ variable_names : ndarray of length n_features
1371
  Names of each variable in the training dataset, `X`.
1372
  X_units : list[str] of length n_features
1373
  Units of each variable in the training dataset, `X`.
 
1423
  if weights is not None:
1424
  weights = check_array(weights, ensure_2d=False)
1425
  check_consistent_length(weights, y)
1426
+ X, y = self._validate_data_X_y(X, y)
1427
  self.feature_names_in_ = _safe_check_feature_names_in(
1428
  self, variable_names, generate_names=False
1429
  )
 
1433
  self.display_feature_names_in_ = np.array(
1434
  [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
1435
  )
1436
+ variable_names = self.feature_names_in_
1437
  else:
1438
  self.display_feature_names_in_ = self.feature_names_in_
1439
+ variable_names = self.feature_names_in_
 
1440
 
1441
  # Handle multioutput data
1442
  if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
 
1451
 
1452
  return X, y, Xresampled, weights, variable_names, X_units, y_units
1453
 
1454
def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
    """Validate `X` and `y` together and return them as a typed pair.

    Delegates to `self._validate_data` with `reset=True` and
    `multi_output=True`; the result is returned unchanged, only
    re-typed as a `(X, y)` tuple for the benefit of type checkers.
    """
    validated = self._validate_data(  # type: ignore
        X=X, y=y, reset=True, multi_output=True
    )
    return cast(Tuple[ndarray, ndarray], validated)
1457
+
1458
def _validate_data_X(self, X) -> ndarray:
    """Validate `X` alone and return it typed as a single array.

    Delegates to `self._validate_data` with `reset=False`. Because only
    `X` is passed (no `y`), the result is one array rather than a tuple,
    so the return type is `ndarray` and not `Tuple[ndarray]`.

    Parameters
    ----------
    X : array-like
        Input data to validate.

    Returns
    -------
    ndarray
        Whatever `self._validate_data` returned, unchanged; `cast` only
        informs the type checker and does nothing at runtime.
    """
    raw_out = self._validate_data(X=X, reset=False)  # type: ignore
    return cast(ndarray, raw_out)
1461
+
1462
  def _pre_transform_training_data(
1463
+ self,
1464
+ X: ndarray,
1465
+ y: ndarray,
1466
+ Xresampled: Union[ndarray, None],
1467
+ variable_names: ArrayLike[str],
1468
+ X_units: Union[ArrayLike[str], None],
1469
+ y_units: Union[ArrayLike[str], str, None],
1470
+ random_state: np.random.RandomState,
1471
  ):
1472
  """
1473
  Transform the training data before fitting the symbolic regressor.
 
1476
 
1477
  Parameters
1478
  ----------
1479
+ X : ndarray
1480
  Training data of shape (n_samples, n_features).
1481
+ y : ndarray
1482
  Target values of shape (n_samples,) or (n_samples, n_targets).
1483
  Will be cast to X's dtype if necessary.
1484
+ Xresampled : ndarray | None
1485
  Resampled training data, of shape `(n_resampled, n_features)`,
1486
  used for denoising.
1487
  variable_names : list[str]
 
1519
  """
1520
  # Feature selection transformation
1521
  if self.select_k_features:
1522
+ selection_mask = run_feature_selection(
1523
  X, y, self.select_k_features, random_state=random_state
1524
  )
1525
+ X = X[:, selection_mask]
1526
 
1527
  if Xresampled is not None:
1528
+ Xresampled = Xresampled[:, selection_mask]
1529
 
1530
  # Reduce variable_names to selection
1531
+ variable_names = cast(
1532
+ ArrayLike[str],
1533
+ [
1534
+ variable_names[i]
1535
+ for i in range(len(variable_names))
1536
+ if selection_mask[i]
1537
+ ],
1538
+ )
1539
 
1540
  if X_units is not None:
1541
+ X_units = cast(
1542
+ ArrayLike[str],
1543
+ [X_units[i] for i in range(len(X_units)) if selection_mask[i]],
1544
+ )
1545
  self.X_units_ = copy.deepcopy(X_units)
1546
 
1547
  # Re-perform data validation and feature name updating
1548
+ X, y = self._validate_data_X_y(X, y)
1549
  # Update feature names with selected variable names
1550
+ self.selection_mask_ = selection_mask
1551
  self.feature_names_in_ = _check_feature_names_in(self, variable_names)
1552
  self.display_feature_names_in_ = self.feature_names_in_
1553
  print(f"Using features {self.feature_names_in_}")
 
1563
 
1564
  return X, y, variable_names, X_units, y_units
1565
 
1566
+ def _run(
1567
+ self,
1568
+ X: ndarray,
1569
+ y: ndarray,
1570
+ runtime_params: _DynamicallySetParams,
1571
+ weights: Optional[ndarray],
1572
+ seed: int,
1573
+ ):
1574
  """
1575
  Run the symbolic regression fitting process on the julia backend.
1576
 
1577
  Parameters
1578
  ----------
1579
+ X : ndarray
1580
  Training data of shape `(n_samples, n_features)`.
1581
+ y : ndarray
1582
  Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
1583
  Will be cast to `X`'s dtype if necessary.
1584
+ runtime_params : DynamicallySetParams
1585
+ Dynamically set versions of some parameters passed in __init__.
1586
+ weights : ndarray | None
1587
  Weight array of the same shape as `y`.
1588
  Each element is how to weight the mean-square-error loss
1589
  for that particular element of y.
 
1602
  """
1603
  # Need to be global as we don't want to recreate/reinstate julia for
1604
  # every new instance of PySRRegressor
1605
+ global ALREADY_RAN
1606
 
1607
  # These are the parameters which may be modified from the ones
1608
  # specified in init, so we define them here locally:
1609
+ binary_operators = runtime_params.binary_operators
1610
+ unary_operators = runtime_params.unary_operators
1611
+ maxdepth = runtime_params.maxdepth
1612
+ constraints = runtime_params.constraints
1613
+ multithreading = runtime_params.multithreading
1614
+ batch_size = runtime_params.batch_size
1615
+ update_verbosity = runtime_params.update_verbosity
1616
+ progress = runtime_params.progress
1617
+ warmup_maxsize_by = runtime_params.warmup_maxsize_by
1618
+
1619
  nested_constraints = self.nested_constraints
1620
  complexity_of_operators = self.complexity_of_operators
 
1621
  cluster_manager = self.cluster_manager
 
 
 
1622
 
1623
  # Start julia backend processes
1624
+ if not ALREADY_RAN and update_verbosity != 0:
1625
  print("Compiling Julia backend...")
1626
 
1627
  if cluster_manager is not None:
 
1660
  complexity_of_operators_str += f"({k}) => {v}, "
1661
  complexity_of_operators_str += ")"
1662
  complexity_of_operators = jl.seval(complexity_of_operators_str)
1663
+ # TODO: Refactor this into helper function
1664
 
1665
  custom_loss = jl.seval(
1666
  str(self.elementwise_loss)
 
1697
  optimize=self.weight_optimize,
1698
  )
1699
 
1700
+ jl_binary_operators: List[Any] = []
1701
+ jl_unary_operators: List[Any] = []
1702
+ for input_list, output_list, name in [
1703
+ (binary_operators, jl_binary_operators, "binary"),
1704
+ (unary_operators, jl_unary_operators, "unary"),
1705
+ ]:
1706
+ for op in input_list:
1707
+ jl_op = jl.seval(op)
1708
+ if not jl_is_function(jl_op):
1709
+ raise ValueError(
1710
+ f"When building `{name}_operators`, `'{op}'` did not return a Julia function"
1711
+ )
1712
+ output_list.append(jl_op)
1713
+
1714
  # Call to Julia backend.
1715
  # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
1716
  options = SymbolicRegression.Options(
1717
+ binary_operators=jl_array(jl_binary_operators, dtype=jl.Function),
1718
+ unary_operators=jl_array(jl_unary_operators, dtype=jl.Function),
1719
  bin_constraints=jl_array(bin_constraints),
1720
  una_constraints=jl_array(una_constraints),
1721
  complexity_of_operators=complexity_of_operators,
 
1747
  fraction_replaced_hof=self.fraction_replaced_hof,
1748
  should_simplify=self.should_simplify,
1749
  should_optimize_constants=self.should_optimize_constants,
1750
+ warmup_maxsize_by=warmup_maxsize_by,
 
 
1751
  use_frequency=self.use_frequency,
1752
  use_frequency_in_tournament=self.use_frequency_in_tournament,
1753
  adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
 
1854
  if self.delete_tempfiles:
1855
  shutil.rmtree(self.tempdir_)
1856
 
1857
+ ALREADY_RAN = True
1858
 
1859
  return self
1860
 
 
1864
  y,
1865
  Xresampled=None,
1866
  weights=None,
1867
+ variable_names: Optional[ArrayLike[str]] = None,
1868
+ X_units: Optional[ArrayLike[str]] = None,
1869
+ y_units: Optional[Union[str, ArrayLike[str]]] = None,
1870
  ) -> "PySRRegressor":
1871
  """
1872
  Search for equations to fit the dataset and store them in `self.equations_`.
 
1928
  self.X_units_ = None
1929
  self.y_units_ = None
1930
 
 
 
 
1931
  self._setup_equation_file()
1932
 
1933
+ runtime_params = self._validate_and_modify_params()
1934
 
1935
  (
1936
  X,
 
1955
  "More datapoints will lower the search speed."
1956
  )
1957
 
1958
+ random_state = check_random_state(self.random_state) # For np random
1959
+ seed = cast(int, random_state.randint(0, 2**31 - 1)) # For julia random
1960
+
1961
  # Pre transformations (feature selection and denoising)
1962
  X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
1963
  X, y, Xresampled, variable_names, X_units, y_units, random_state
 
1999
  self._checkpoint()
2000
 
2001
  # Perform the search:
2002
+ self._run(X, y, runtime_params, weights=weights, seed=seed)
2003
 
2004
  # Then, after fit, we save again, so the pickle file contains
2005
  # the equations:
 
2008
 
2009
  return self
2010
 
2011
+ def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None:
2012
  """
2013
  Update self.equations_ with any new options passed.
2014
 
 
2017
 
2018
  Parameters
2019
  ----------
2020
+ checkpoint_file : str or Path
2021
  Path to checkpoint hall of fame file to be loaded.
2022
  The default will use the set `equation_file_`.
2023
  """
2024
+ if checkpoint_file is not None:
2025
  self.equation_file_ = checkpoint_file
2026
  self.equation_file_contents_ = None
2027
  check_is_fitted(self, attributes=["equation_file_"])
 
2073
  if self.selection_mask_ is not None:
2074
  # RangeIndex enforces column order allowing columns to
2075
  # be correctly filtered with self.selection_mask_
2076
+ X = X[X.columns[self.selection_mask_]]
2077
  X.columns = self.feature_names_in_
2078
  # Without feature information, CallableEquation/lambda_format equations
2079
  # require that the column order of X matches that of the X used during
 
2083
  # reordered/reindexed to match those of the transformed (denoised and
2084
  # feature selected) X in fit.
2085
  X = X.reindex(columns=self.feature_names_in_)
2086
+ X = self._validate_data_X(X)
2087
 
2088
  try:
2089
+ if isinstance(best_equation, list):
2090
+ assert self.nout_ > 1
2091
  return np.stack(
2092
  [eq["lambda_format"](X) for eq in best_equation], axis=1
2093
  )
2094
+ else:
2095
+ return best_equation["lambda_format"](X)
2096
  except Exception as error:
2097
  raise ValueError(
2098
  "Failed to evaluate the expression. "
 
2122
  """
2123
  self.refresh()
2124
  best_equation = self.get_best(index=index)
2125
+ if isinstance(best_equation, list):
2126
+ assert self.nout_ > 1
2127
  return [eq["sympy_format"] for eq in best_equation]
2128
+ else:
2129
+ return best_equation["sympy_format"]
2130
 
2131
  def latex(self, index=None, precision=3):
2132
  """
 
2186
  self.set_params(output_jax_format=True)
2187
  self.refresh()
2188
  best_equation = self.get_best(index=index)
2189
+ if isinstance(best_equation, list):
2190
+ assert self.nout_ > 1
2191
  return [eq["jax_format"] for eq in best_equation]
2192
+ else:
2193
+ return best_equation["jax_format"]
2194
 
2195
  def pytorch(self, index=None):
2196
  """
 
2218
  self.set_params(output_torch_format=True)
2219
  self.refresh()
2220
  best_equation = self.get_best(index=index)
2221
+ if isinstance(best_equation, list):
2222
  return [eq["torch_format"] for eq in best_equation]
2223
+ else:
2224
+ return best_equation["torch_format"]
2225
 
2226
  def _read_equation_file(self):
2227
  """Read the hall of fame file created by `SymbolicRegression.jl`."""
 
2320
  lastComplexity = 0
2321
  sympy_format = []
2322
  lambda_format = []
2323
+ jax_format = []
2324
+ torch_format = []
 
 
2325
 
2326
  for _, eqn_row in output.iterrows():
2327
  eqn = pysr2sympy(
 
2433
  """
2434
  self.refresh()
2435
 
2436
+ if isinstance(self.equations_, list):
2437
  if indices is not None:
2438
  assert isinstance(indices, list)
2439
  assert isinstance(indices[0], list)
 
2442
  table_string = sympy2multilatextable(
2443
  self.equations_, indices=indices, precision=precision, columns=columns
2444
  )
2445
+ elif isinstance(self.equations_, pd.DataFrame):
2446
  if indices is not None:
2447
  assert isinstance(indices, list)
2448
  assert isinstance(indices[0], int)
 
2450
  table_string = sympy2latextable(
2451
  self.equations_, indices=indices, precision=precision, columns=columns
2452
  )
2453
+ else:
2454
+ raise ValueError(
2455
+ "Invalid type for equations_ to pass to `latex_table`. "
2456
+ "Expected a DataFrame or a list of DataFrames."
2457
+ )
2458
 
2459
+ return with_preamble(table_string)
 
 
 
 
 
 
 
2460
 
2461
 
2462
  def idx_model_selection(equations: pd.DataFrame, model_selection: str):
 
2474
  f"{model_selection} is not a valid model selection strategy."
2475
  )
2476
  return chosen_idx
2477
+
2478
+
2479
def _mutate_parameter(param_name: str, param_value):
    """Return the value to use for a user-specified parameter.

    Most values are passed through untouched. Three cases are adjusted:

    - `binary_operators` / `unary_operators` given as a single string are
      wrapped in a one-element list.
    - `batch_size` below one triggers a warning and becomes `1`.
    - `progress` equal to `True` while `sys.stdout` exposes no `buffer`
      attribute triggers a warning and becomes `False`.

    Parameters
    ----------
    param_name : str
        Name of the parameter being processed.
    param_value
        Value supplied for that parameter.

    Returns
    -------
    The adjusted value, or `param_value` itself when no case applies.
    """
    operator_params = ["binary_operators", "unary_operators"]
    if param_name in operator_params:
        if isinstance(param_value, str):
            return [param_value]

    if param_name == "batch_size":
        if param_value < 1:
            warnings.warn(
                "Given `batch_size` must be greater than or equal to one. "
                "`batch_size` has been increased to equal one."
            )
            return 1

    if param_name == "progress":
        # The equality test (rather than truthiness) is kept on purpose so
        # that only values equal to `True` are affected.
        if param_value == True and "buffer" not in sys.stdout.__dir__():
            warnings.warn(
                "Note: it looks like you are running in Jupyter. "
                "The progress bar will be turned off."
            )
            return False

    return param_value
pysr/test/test.py CHANGED
@@ -431,6 +431,16 @@ class TestPipeline(unittest.TestCase):
431
  )
432
  np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
433
 
 
 
 
 
 
 
 
 
 
 
434
 
435
  def manually_create_model(equations, feature_names=None):
436
  if feature_names is None:
@@ -526,7 +536,7 @@ class TestFeatureSelection(unittest.TestCase):
526
  X = self.rstate.randn(20000, 5)
527
  y = X[:, 2] ** 2 + X[:, 3] ** 2
528
  selected = run_feature_selection(X, y, select_k_features=2)
529
- self.assertEqual(sorted(selected), [2, 3])
530
 
531
  def test_feature_selection_handler(self):
532
  X = self.rstate.randn(20000, 5)
@@ -538,8 +548,8 @@ class TestFeatureSelection(unittest.TestCase):
538
  variable_names=var_names,
539
  y=y,
540
  )
541
- self.assertTrue((2 in selection) and (3 in selection))
542
- selected_var_names = [var_names[i] for i in selection]
543
  self.assertEqual(set(selected_var_names), set("x2 x3".split(" ")))
544
  np.testing.assert_array_equal(
545
  np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
 
431
  )
432
  np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
433
 
434
def test_jl_function_error(self):
    """Fitting with an operator string that is not a Julia function raises ValueError."""
    # TODO: Move this to better class
    expected_message = (
        "When building `unary_operators`, `'1'` did not return a Julia function"
    )

    # `"1"` evaluates to a number rather than a function, so `fit` must fail.
    with self.assertRaises(ValueError) as cm:
        PySRRegressor(unary_operators=["1"]).fit([[1]], [1])

    self.assertIn(expected_message, str(cm.exception))
443
+
444
 
445
  def manually_create_model(equations, feature_names=None):
446
  if feature_names is None:
 
536
  X = self.rstate.randn(20000, 5)
537
  y = X[:, 2] ** 2 + X[:, 3] ** 2
538
  selected = run_feature_selection(X, y, select_k_features=2)
539
+ np.testing.assert_array_equal(selected, [False, False, True, True, False])
540
 
541
  def test_feature_selection_handler(self):
542
  X = self.rstate.randn(20000, 5)
 
548
  variable_names=var_names,
549
  y=y,
550
  )
551
+ np.testing.assert_array_equal(selection, [False, False, True, True, False])
552
+ selected_var_names = [var_names[i] for i in range(5) if selection[i]]
553
  self.assertEqual(set(selected_var_names), set("x2 x3".split(" ")))
554
  np.testing.assert_array_equal(
555
  np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
pysr/utils.py CHANGED
@@ -1,10 +1,18 @@
1
  import os
2
  import re
 
 
3
 
4
- from sklearn.utils.validation import _check_feature_names_in
 
5
 
 
6
 
7
- def _csv_filename_to_pkl_filename(csv_filename: str) -> str:
 
 
 
 
8
  if os.path.splitext(csv_filename)[1] == ".pkl":
9
  return csv_filename
10
 
 
1
  import os
2
  import re
3
+ from pathlib import Path
4
+ from typing import Any, List, TypeVar, Union
5
 
6
+ from numpy import ndarray
7
+ from sklearn.utils.validation import _check_feature_names_in # type: ignore
8
 
9
+ T = TypeVar("T", bound=Any)
10
 
11
+ ArrayLike = Union[ndarray, List[T]]
12
+ PathLike = Union[str, Path]
13
+
14
+
15
+ def _csv_filename_to_pkl_filename(csv_filename: PathLike) -> PathLike:
16
  if os.path.splitext(csv_filename)[1] == ".pkl":
17
  return csv_filename
18
 
requirements.txt CHANGED
@@ -5,4 +5,3 @@ scikit_learn>=1.0.0,<2.0.0
5
  juliacall==0.9.20
6
  click>=7.0.0,<9.0.0
7
  setuptools>=50.0.0
8
- typing_extensions>=4.0.0,<5.0.0; python_version < "3.8"
 
5
  juliacall==0.9.20
6
  click>=7.0.0,<9.0.0
7
  setuptools>=50.0.0