MilesCranmer committed
Commit b242a62 · unverified · 2 parents: 5620b3a 0b9e421

Merge pull request #428 from MilesCranmer/refactor-utils

.github/workflows/CI.yml CHANGED
@@ -143,3 +143,29 @@ jobs:
         run: |
           pip install coveralls
           coveralls --finish
+
+  types:
+    name: Check types
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+    strategy:
+      matrix:
+        python-version: ['3.10']
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: "Set up Python"
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+      - name: "Install PySR and all dependencies"
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install mypy jax jaxlib torch
+          python setup.py install
+      - name: "Run mypy"
+        run: mypy --install-types --non-interactive pysr
mypy.ini ADDED
@@ -0,0 +1,8 @@
+[mypy]
+warn_return_any = True
+
+[mypy-sklearn.*]
+ignore_missing_imports = True
+
+[mypy-julia.*]
+ignore_missing_imports = True
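
The key setting here is `warn_return_any = True`: mypy flags any function whose annotated return type is only satisfied by a value it infers as `Any`. A minimal illustration of the kind of error this surfaces (a hypothetical snippet, not code from this commit):

    from typing import Any

    def get_precision(settings: Any) -> int:
        # With warn_return_any = True, mypy reports something like:
        #   Returning Any from function declared to return "int"
        return settings["precision"]
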
pysr/__init__.py CHANGED
@@ -1,9 +1,10 @@
 from . import sklearn_monkeypatch
+from .deprecated import best, best_callable, best_row, best_tex, pysr
 from .export_jax import sympy2jax
 from .export_torch import sympy2torch
 from .feynman_problems import FeynmanProblem, Problem
 from .julia_helpers import install
-from .sr import PySRRegressor, best, best_callable, best_row, best_tex, pysr
+from .sr import PySRRegressor
 from .version import __version__
 
 __all__ = [
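
The top-level import surface is unchanged; only the internal source of the deprecated helpers moves. A sketch of what continues to work, based on the import list above:

    from pysr import PySRRegressor              # main estimator, still defined in pysr/sr.py
    from pysr import best, best_callable, pysr  # deprecated shims, now re-exported from pysr/deprecated.py
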
pysr/denoising.py ADDED
@@ -0,0 +1,35 @@
+"""Functions for denoising data during preprocessing."""
+import numpy as np
+
+
+def denoise(X, y, Xresampled=None, random_state=None):
+    """Denoise the dataset using a Gaussian process."""
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
+
+    gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
+    gpr = GaussianProcessRegressor(
+        kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state
+    )
+    gpr.fit(X, y)
+
+    if Xresampled is not None:
+        return Xresampled, gpr.predict(Xresampled)
+
+    return X, gpr.predict(X)
+
+
+def multi_denoise(X, y, Xresampled=None, random_state=None):
+    """Perform `denoise` along each column of `y` independently."""
+    y = np.stack(
+        [
+            denoise(X, y[:, i], Xresampled=Xresampled, random_state=random_state)[1]
+            for i in range(y.shape[1])
+        ],
+        axis=1,
+    )
+
+    if Xresampled is not None:
+        return Xresampled, y
+
+    return X, y
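
These helpers are called by `PySRRegressor` when `denoise=True`, but they can also be exercised directly. A minimal sketch with synthetic data (illustrative only):

    import numpy as np
    from pysr.denoising import denoise

    rng = np.random.RandomState(0)
    X = rng.randn(100, 2)
    y = np.cos(X[:, 0]) + 0.1 * rng.randn(100)  # noisy 1-D target

    # Fit a GP to (X, y) and replace y with the GP's smoothed predictions.
    X_out, y_denoised = denoise(X, y, random_state=0)
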
pysr/deprecated.py CHANGED
@@ -1,4 +1,58 @@
 """Various functions to deprecate features."""
+import warnings
+
+
+def pysr(X, y, weights=None, **kwargs):  # pragma: no cover
+    from .sr import PySRRegressor
+
+    warnings.warn(
+        "Calling `pysr` is deprecated. "
+        "Please use `model = PySRRegressor(**params); "
+        "model.fit(X, y)` going forward.",
+        FutureWarning,
+    )
+    model = PySRRegressor(**kwargs)
+    model.fit(X, y, weights=weights)
+    return model.equations_
+
+
+def best(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best` has been deprecated. "
+        "Please use the `PySRRegressor` interface. "
+        "After fitting, you can return `.sympy()` "
+        "to get the sympy representation "
+        "of the best equation."
+    )
+
+
+def best_row(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best_row` has been deprecated. "
+        "Please use the `PySRRegressor` interface. "
+        "After fitting, you can run `print(model)` to view the best equation, "
+        "or "
+        "`model.get_best()` to return the best equation's "
+        "row in `model.equations_`."
+    )
+
+
+def best_tex(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best_tex` has been deprecated. "
+        "Please use the `PySRRegressor` interface. "
+        "After fitting, you can return `.latex()` to "
+        "get the sympy representation "
+        "of the best equation."
+    )
+
+
+def best_callable(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best_callable` has been deprecated. Please use the `PySRRegressor` "
+        "interface. After fitting, you can use "
+        "`.predict(X)` to use the best callable."
+    )
 
 
 def make_deprecated_kwargs_for_pysr_regressor():
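
The messages above spell out the migration path from the old functional API. In sketch form (placeholder data and parameter values):

    import numpy as np
    from pysr import PySRRegressor

    X = np.random.randn(100, 2)
    y = X[:, 0] ** 2

    # Old style (now a shim that raises a FutureWarning before delegating):
    #     equations = pysr(X, y, niterations=40)

    # Current style:
    model = PySRRegressor(niterations=40)
    model.fit(X, y)
    print(model)                # replaces `best_row`
    best_expr = model.sympy()   # replaces `best`
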
pysr/export_latex.py CHANGED
@@ -1,5 +1,5 @@
 """Functions to help export PySR equations to LaTeX."""
-from typing import List
+from typing import List, Optional, Tuple
 
 import pandas as pd
 import sympy
@@ -19,14 +19,16 @@ class PreciseLatexPrinter(LatexPrinter):
         return super()._print_Float(reduced_float)
 
 
-def sympy2latex(expr, prec=3, full_prec=True, **settings):
+def sympy2latex(expr, prec=3, full_prec=True, **settings) -> str:
     """Convert sympy expression to LaTeX with custom precision."""
     settings["full_prec"] = full_prec
     printer = PreciseLatexPrinter(settings=settings, prec=prec)
     return printer.doprint(expr)
 
 
-def generate_table_environment(columns=["equation", "complexity", "loss"]):
+def generate_table_environment(
+    columns: List[str] = ["equation", "complexity", "loss"]
+) -> Tuple[str, str]:
     margins = "c" * len(columns)
     column_map = {
         "complexity": "Complexity",
@@ -58,12 +60,12 @@ def generate_table_environment(columns=["equation", "complexity", "loss"]):
 
 def sympy2latextable(
     equations: pd.DataFrame,
-    indices: List[int] = None,
+    indices: Optional[List[int]] = None,
     precision: int = 3,
-    columns=["equation", "complexity", "loss", "score"],
+    columns: List[str] = ["equation", "complexity", "loss", "score"],
     max_equation_length: int = 50,
     output_variable_name: str = "y",
-):
+) -> str:
     """Generate a booktabs-style LaTeX table for a single set of equations."""
     assert isinstance(equations, pd.DataFrame)
 
@@ -71,7 +73,7 @@ def sympy2latextable(
     latex_table_content = []
 
     if indices is None:
-        indices = range(len(equations))
+        indices = list(equations.index)
 
     for i in indices:
         latex_equation = sympy2latex(
@@ -126,11 +128,11 @@ def sympy2latextable(
 
 def sympy2multilatextable(
     equations: List[pd.DataFrame],
-    indices: List[List[int]] = None,
+    indices: Optional[List[List[int]]] = None,
    precision: int = 3,
-    columns=["equation", "complexity", "loss", "score"],
-    output_variable_names: str = None,
-):
+    columns: List[str] = ["equation", "complexity", "loss", "score"],
+    output_variable_names: Optional[List[str]] = None,
+) -> str:
     """Generate multiple latex tables for a list of equation sets."""
     # TODO: Let user specify custom output variable
 
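The precision-aware printer can also be used on its own; the table generators call it once per equation. A small sketch of `sympy2latex` (the printed output is an approximation, shown only in the comment):

    import sympy
    from pysr.export_latex import sympy2latex

    x0 = sympy.Symbol("x0")
    expr = 3.14159265 * sympy.cos(x0) + 1.23456789e-7

    print(sympy2latex(expr, prec=3))
    # Floats are reduced to roughly 3 significant figures before printing,
    # e.g. something like: 3.14 \cos{\left(x_{0} \right)} + 1.23 \cdot 10^{-7}
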
pysr/export_sympy.py CHANGED
@@ -51,14 +51,14 @@ sympy_mappings = {
 
 
 def create_sympy_symbols(
-    feature_names_in: Optional[List[str]] = None,
+    feature_names_in: List[str],
 ) -> List[sympy.Symbol]:
     return [sympy.Symbol(variable) for variable in feature_names_in]
 
 
 def pysr2sympy(
     equation: str, *, extra_sympy_mappings: Optional[Dict[str, Callable]] = None
-) -> sympy.Expr:
+):
     local_sympy_mappings = {
         **(extra_sympy_mappings if extra_sympy_mappings else {}),
         **sympy_mappings,
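
A usage sketch consistent with the tightened signature (`feature_names_in` is now required rather than `Optional`); the equation string is a placeholder:

    from pysr.export_sympy import create_sympy_symbols, pysr2sympy

    symbols = create_sympy_symbols(["x0", "x1"])   # [Symbol("x0"), Symbol("x1")]
    expr = pysr2sympy("cos(x0) + 2.5 * x1")        # parsed using PySR's operator mappings
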
pysr/feature_selection.py ADDED
@@ -0,0 +1,35 @@
+"""Functions for doing feature selection during preprocessing."""
+import numpy as np
+
+
+def run_feature_selection(X, y, select_k_features, random_state=None):
+    """
+    Find most important features.
+
+    Uses a gradient boosting tree regressor as a proxy for finding
+    the k most important features in X, returning indices for those
+    features as output.
+    """
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.feature_selection import SelectFromModel
+
+    clf = RandomForestRegressor(
+        n_estimators=100, max_depth=3, random_state=random_state
+    )
+    clf.fit(X, y)
+    selector = SelectFromModel(
+        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
+    )
+    return selector.get_support(indices=True)
+
+
+# Function has not been removed only due to usage in module tests
+def _handle_feature_selection(X, select_k_features, y, variable_names):
+    if select_k_features is not None:
+        selection = run_feature_selection(X, y, select_k_features)
+        print(f"Using features {[variable_names[i] for i in selection]}")
+        X = X[:, selection]
+    else:
+        selection = None
+
+    return X, selection
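
A usage sketch of the relocated selector, with synthetic data in which only two columns matter:

    import numpy as np
    from pysr.feature_selection import run_feature_selection

    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = 2.0 * X[:, 1] - 0.5 * X[:, 3]

    # Indices of the 2 columns the random forest ranks as most important
    # (expected to be columns 1 and 3 here).
    selected = run_feature_selection(X, y, select_k_features=2, random_state=0)
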
pysr/feynman_problems.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 
 import numpy as np
 
-from .sr import best, pysr
+from .deprecated import best, pysr
 
 PKG_DIR = Path(__file__).parents[1]
 FEYNMAN_DATASET = PKG_DIR / "datasets" / "FeynmanEquations.csv"
pysr/sr.py CHANGED
@@ -11,6 +11,7 @@ from datetime import datetime
 from io import StringIO
 from multiprocessing import cpu_count
 from pathlib import Path
+from typing import List, Optional
 
 import numpy as np
 import pandas as pd
@@ -18,12 +19,14 @@ from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.utils import check_array, check_consistent_length, check_random_state
 from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
 
+from .denoising import denoise, multi_denoise
 from .deprecated import make_deprecated_kwargs_for_pysr_regressor
 from .export_jax import sympy2jax
 from .export_latex import sympy2latex, sympy2latextable, sympy2multilatextable
 from .export_numpy import sympy2numpy
 from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
 from .export_torch import sympy2torch
+from .feature_selection import run_feature_selection
 from .julia_helpers import (
     _escape_filename,
     _load_backend,
@@ -33,23 +36,18 @@ from .julia_helpers import (
     init_julia,
     is_julia_version_greater_eq,
 )
+from .utils import (
+    _csv_filename_to_pkl_filename,
+    _preprocess_julia_floats,
+    _safe_check_feature_names_in,
+    _subscriptify,
+)
 
 Main = None  # TODO: Rename to more descriptive name like "julia_runtime"
 
 already_ran = False
 
 
-def pysr(X, y, weights=None, **kwargs):  # pragma: no cover
-    warnings.warn(
-        "Calling `pysr` is deprecated. "
-        "Please use `model = PySRRegressor(**params); model.fit(X, y)` going forward.",
-        FutureWarning,
-    )
-    model = PySRRegressor(**kwargs)
-    model.fit(X, y, weights=weights)
-    return model.equations_
-
-
 def _process_constraints(binary_operators, unary_operators, constraints):
     constraints = constraints.copy()
     for op in unary_operators:
@@ -172,37 +170,6 @@ def _check_assertions(
     )
 
 
-def best(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best` has been deprecated. Please use the `PySRRegressor` interface. "
-        "After fitting, you can return `.sympy()` to get the sympy representation "
-        "of the best equation."
-    )
-
-
-def best_row(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best_row` has been deprecated. Please use the `PySRRegressor` interface. "
-        "After fitting, you can run `print(model)` to view the best equation, or "
-        "`model.get_best()` to return the best equation's row in `model.equations_`."
-    )
-
-
-def best_tex(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best_tex` has been deprecated. Please use the `PySRRegressor` interface. "
-        "After fitting, you can return `.latex()` to get the sympy representation "
-        "of the best equation."
-    )
-
-
-def best_callable(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best_callable` has been deprecated. Please use the `PySRRegressor` "
-        "interface. After fitting, you can use `.predict(X)` to use the best callable."
-    )
-
-
 # Class validation constants
 VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
 
@@ -945,10 +912,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         model : PySRRegressor
             The model with fitted equations.
         """
-        if os.path.splitext(equation_file)[1] != ".pkl":
-            pkl_filename = _csv_filename_to_pkl_filename(equation_file)
-        else:
-            pkl_filename = equation_file
+
+        pkl_filename = _csv_filename_to_pkl_filename(equation_file)
 
         # Try to load model from <equation_file>.pkl
         print(f"Checking if {pkl_filename} exists...")
@@ -1502,19 +1467,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         # Denoising transformation
         if self.denoise:
             if self.nout_ > 1:
-                y = np.stack(
-                    [
-                        _denoise(
-                            X, y[:, i], Xresampled=Xresampled, random_state=random_state
-                        )[1]
-                        for i in range(self.nout_)
-                    ],
-                    axis=1,
+                X, y = multi_denoise(
+                    X, y, Xresampled=Xresampled, random_state=random_state
                 )
-                if Xresampled is not None:
-                    X = Xresampled
             else:
-                X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
+                X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)
 
         return X, y, variable_names, X_units, y_units
 
@@ -1783,10 +1740,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         y,
         Xresampled=None,
         weights=None,
-        variable_names=None,
-        X_units=None,
-        y_units=None,
-    ):
+        variable_names: Optional[List[str]] = None,
+        X_units: Optional[List[str]] = None,
+        y_units: Optional[List[str]] = None,
+    ) -> "PySRRegressor":
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
 
@@ -2373,7 +2330,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return "\n".join(preamble_string + [table_string])
 
 
-def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
+def idx_model_selection(equations: pd.DataFrame, model_selection: str):
     """Select an expression and return its index."""
     if model_selection == "accuracy":
         chosen_idx = equations["loss"].idxmin()
@@ -2388,100 +2345,3 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
             f"{model_selection} is not a valid model selection strategy."
         )
     return chosen_idx
-
-
-def _denoise(X, y, Xresampled=None, random_state=None):
-    """Denoise the dataset using a Gaussian process."""
-    from sklearn.gaussian_process import GaussianProcessRegressor
-    from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
-
-    gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
-    gpr = GaussianProcessRegressor(
-        kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state
-    )
-    gpr.fit(X, y)
-    if Xresampled is not None:
-        return Xresampled, gpr.predict(Xresampled)
-
-    return X, gpr.predict(X)
-
-
-# Function has not been removed only due to usage in module tests
-def _handle_feature_selection(X, select_k_features, y, variable_names):
-    if select_k_features is not None:
-        selection = run_feature_selection(X, y, select_k_features)
-        print(f"Using features {[variable_names[i] for i in selection]}")
-        X = X[:, selection]
-
-    else:
-        selection = None
-    return X, selection
-
-
-def run_feature_selection(X, y, select_k_features, random_state=None):
-    """
-    Find most important features.
-
-    Uses a gradient boosting tree regressor as a proxy for finding
-    the k most important features in X, returning indices for those
-    features as output.
-    """
-    from sklearn.ensemble import RandomForestRegressor
-    from sklearn.feature_selection import SelectFromModel
-
-    clf = RandomForestRegressor(
-        n_estimators=100, max_depth=3, random_state=random_state
-    )
-    clf.fit(X, y)
-    selector = SelectFromModel(
-        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
-    )
-    return selector.get_support(indices=True)
-
-
-def _csv_filename_to_pkl_filename(csv_filename) -> str:
-    # Assume that the csv filename is of the form "foo.csv"
-    assert str(csv_filename).endswith(".csv")
-
-    dirname = str(os.path.dirname(csv_filename))
-    basename = str(os.path.basename(csv_filename))
-    base = str(os.path.splitext(basename)[0])
-
-    pkl_basename = base + ".pkl"
-
-    return os.path.join(dirname, pkl_basename)
-
-
-_regexp_im = re.compile(r"\b(\d+\.\d+)im\b")
-_regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b")
-_regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b")
-
-_apply_regexp_im = lambda x: _regexp_im.sub(r"\1j", x)
-_apply_regexp_im_sci = lambda x: _regexp_im_sci.sub(r"\1e\2j", x)
-_apply_regexp_sci = lambda x: _regexp_sci.sub(r"\1e\2", x)
-
-
-def _preprocess_julia_floats(s: str) -> str:
-    if isinstance(s, str):
-        s = _apply_regexp_im(s)
-        s = _apply_regexp_im_sci(s)
-        s = _apply_regexp_sci(s)
-    return s
-
-
-def _subscriptify(i: int) -> str:
-    """Converts integer to subscript text form.
-
-    For example, 123 -> "₁₂₃".
-    """
-    return "".join([chr(0x2080 + int(c)) for c in str(i)])
-
-
-def _safe_check_feature_names_in(self, variable_names, generate_names=True):
-    """_check_feature_names_in with compat for old versions."""
-    try:
-        return _check_feature_names_in(
-            self, variable_names, generate_names=generate_names
-        )
-    except TypeError:
-        return _check_feature_names_in(self, variable_names)
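With the helpers split out, `sr.py` is left with the estimator itself. A sketch of how the refactored pieces are reached through the public API (placeholder data and settings; `denoise=True` routes through `pysr.denoising` and `select_k_features` through `pysr.feature_selection`):

    import numpy as np
    from pysr import PySRRegressor

    X = np.random.randn(200, 3)
    y = np.cos(X[:, 0]) + 0.05 * np.random.randn(200)

    model = PySRRegressor(niterations=5, denoise=True, select_k_features=2)
    model.fit(X, y, variable_names=["a", "b", "c"])
    print(model)  # best equation chosen via idx_model_selection
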
pysr/test/test.py CHANGED
@@ -14,14 +14,9 @@ from sklearn.utils.estimator_checks import check_estimator
 
 from .. import PySRRegressor, julia_helpers
 from ..export_latex import sympy2latex
-from ..sr import (
-    _check_assertions,
-    _csv_filename_to_pkl_filename,
-    _handle_feature_selection,
-    _process_constraints,
-    idx_model_selection,
-    run_feature_selection,
-)
+from ..feature_selection import _handle_feature_selection, run_feature_selection
+from ..sr import _check_assertions, _process_constraints, idx_model_selection
+from ..utils import _csv_filename_to_pkl_filename
 
 DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
 DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
pysr/utils.py ADDED
@@ -0,0 +1,55 @@
+import os
+import re
+
+from sklearn.utils.validation import _check_feature_names_in
+
+
+def _csv_filename_to_pkl_filename(csv_filename: str) -> str:
+    if os.path.splitext(csv_filename)[1] == ".pkl":
+        return csv_filename
+
+    # Assume that the csv filename is of the form "foo.csv"
+    assert str(csv_filename).endswith(".csv")
+
+    dirname = str(os.path.dirname(csv_filename))
+    basename = str(os.path.basename(csv_filename))
+    base = str(os.path.splitext(basename)[0])
+
+    pkl_basename = base + ".pkl"
+
+    return os.path.join(dirname, pkl_basename)
+
+
+_regexp_im = re.compile(r"\b(\d+\.\d+)im\b")
+_regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b")
+_regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b")
+
+_apply_regexp_im = lambda x: _regexp_im.sub(r"\1j", x)
+_apply_regexp_im_sci = lambda x: _regexp_im_sci.sub(r"\1e\2j", x)
+_apply_regexp_sci = lambda x: _regexp_sci.sub(r"\1e\2", x)
+
+
+def _preprocess_julia_floats(s: str) -> str:
+    if isinstance(s, str):
+        s = _apply_regexp_im(s)
+        s = _apply_regexp_im_sci(s)
+        s = _apply_regexp_sci(s)
+    return s
+
+
+def _safe_check_feature_names_in(self, variable_names, generate_names=True):
+    """_check_feature_names_in with compat for old versions."""
+    try:
+        return _check_feature_names_in(
+            self, variable_names, generate_names=generate_names
+        )
+    except TypeError:
+        return _check_feature_names_in(self, variable_names)
+
+
+def _subscriptify(i: int) -> str:
+    """Converts integer to subscript text form.
+
+    For example, 123 -> "₁₂₃".
+    """
+    return "".join([chr(0x2080 + int(c)) for c in str(i)])