File size: 4,164 Bytes
cd54791
d26d668
0bf77e2
d26d668
cd54791
 
d26d668
cd54791
 
 
 
 
 
 
 
 
 
 
 
 
 
d26d668
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9905c12
d26d668
 
 
 
 
 
 
cd54791
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bf77e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd54791
 
 
4582e28
0bf77e2
 
cd54791
 
 
 
 
 
 
 
0bf77e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from pysr import pysr, best_row
from sklearn.base import BaseEstimator, RegressorMixin
import inspect
import pandas as pd


class PySRRegressor(BaseEstimator, RegressorMixin):
    """Estimator-style wrapper around the ``pysr.pysr`` equation search.

    ``fit`` runs ``pysr.pysr`` and stores its result in ``self.equations``;
    ``predict`` evaluates the equation chosen by ``model_selection``.
    """

    def __init__(self, model_selection="accuracy", **params):
        """Initialize settings for pysr.pysr call.

        :param model_selection: How to select a model. Can be 'accuracy' or 'best'. 'best' will optimize a combination of complexity and accuracy.
        :type model_selection: str
        :param params: Keyword arguments stored and forwarded to pysr.pysr when fit() is called.
        """
        super().__init__()
        self.model_selection = model_selection
        self.params = params

        # Stored equations (None until fit() has been called):
        self.equations = None

    def __repr__(self):
        """Return a table of the stored equations, marking the selected row with ">".

        :raises NotImplementedError: if ``model_selection`` is neither
            'accuracy' nor 'best'.
        """
        if self.equations is None:
            return "PySRRegressor.equations=None"

        equations = self.equations
        selected = [""] * len(equations)
        if self.model_selection == "accuracy":
            # Same choice as get_best(): the last row.
            chosen_row = -1
        elif self.model_selection == "best":
            # NOTE(review): idxmax() returns an index *label*; using it to
            # index the list assumes the default 0..n-1 index - confirm.
            chosen_row = equations["score"].idxmax()
        else:
            raise NotImplementedError
        selected[chosen_row] = ">"

        repr_equations = pd.DataFrame(
            dict(
                selected=selected,
                score=equations["score"],
                Equation=equations["Equation"],
                MSE=equations["MSE"],
                Complexity=equations["Complexity"],
            )
        )
        output = "PySRRegressor.equations=[\n"
        output += repr_equations.__repr__()
        output += "\n]"
        return output

    def set_params(self, **params):
        """Set parameters for pysr.pysr call or model_selection strategy.

        :param params: ``model_selection`` updates the selection strategy;
            every other key is stored for the pysr.pysr call made by fit().
        :returns: self
        """
        for key, value in params.items():
            if key == "model_selection":
                # model_selection is an estimator-level setting (see __init__
                # and get_params); keep it out of self.params so that fit()
                # does not forward it to pysr.pysr as a keyword argument.
                self.model_selection = value
            else:
                self.params[key] = value

        return self

    def get_params(self, deep=True):
        """Return the stored pysr.pysr parameters plus ``model_selection``.

        :param deep: Accepted but ignored.
        :returns: a new dict; the internal parameter dict is not exposed.
        """
        del deep
        return {**self.params, "model_selection": self.model_selection}

    def get_best(self):
        """Return the equation row picked by the ``model_selection`` strategy.

        :returns: ``0.0`` if no equations are stored yet; otherwise the last
            row of ``self.equations`` for 'accuracy', or
            ``best_row(self.equations)`` for 'best'.
        :raises NotImplementedError: for any other ``model_selection`` value.
        """
        if self.equations is None:
            return 0.0
        if self.model_selection == "accuracy":
            return self.equations.iloc[-1]
        elif self.model_selection == "best":
            return best_row(self.equations)
        else:
            raise NotImplementedError

    def fit(self, X, y, weights=None, variable_names=None):
        """Search for equations to fit the dataset.

        :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
        :type X: np.ndarray/pandas.DataFrame
        :param y: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs). Putting in a 2D array will trigger a search for equations for each feature of y.
        :type y: np.ndarray
        :param weights: Optional. Same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y.
        :type weights: np.ndarray
        :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
        :type variable_names: list
        :returns: self, with the search result stored in ``self.equations``.
        """
        if variable_names is None:
            # Fall back to a value given at construction / via set_params.
            variable_names = self.params.get("variable_names")

        # variable_names is passed explicitly below, so drop it from the
        # forwarded parameters to avoid supplying the keyword twice.
        pysr_kwargs = {k: v for k, v in self.params.items() if k != "variable_names"}

        self.equations = pysr(
            X=X,
            y=y,
            weights=weights,
            variable_names=variable_names,
            **pysr_kwargs,
        )
        return self

    def predict(self, X):
        """Evaluate the selected equation on ``X``.

        Looks up the ``"lambda_format"`` entry of the row returned by
        get_best() and calls it with ``X``.

        NOTE: before fit() has stored equations, get_best() returns ``0.0``,
        so the ``"lambda_format"`` lookup on that value fails.
        """
        equation_row = self.get_best()
        np_format = equation_row["lambda_format"]

        return np_format(X)


# Add the docs from pysr() to PySRRegressor():

# inspect.getdoc() returns None when pysr has no docstring (for example when
# Python runs with -OO, which strips docstrings), so fall back to "" instead
# of failing at import time.
_pysr_docstring_split = []
_start_recording = False
for line in (inspect.getdoc(pysr) or "").split("\n"):
    # Skip docs on "X" and "y": recording starts at the binary_operators
    # parameter and stops at the ":returns:" section.
    if ":param binary_operators:" in line:
        _start_recording = True
    if ":returns:" in line:
        _start_recording = False
    if _start_recording:
        _pysr_docstring_split.append(line)
_pysr_docstring = "\n\t".join(_pysr_docstring_split)

# __doc__ is None when docstrings are stripped; only extend it when present.
if PySRRegressor.__init__.__doc__ is not None:
    PySRRegressor.__init__.__doc__ += _pysr_docstring