MilesCranmer committed
Commit ca6dd96
2 Parent(s): 316393e 61635b9

Merge pull request #13 from MilesCranmer/feature_selection

Files changed (1):
  1. pysr/sr.py +34 -4
pysr/sr.py CHANGED
@@ -76,6 +76,7 @@ def pysr(X=None, y=None, weights=None,
         fast_cycle=False,
         maxdepth=None,
         variable_names=[],
+        select_k_features=None,
         threads=None, #deprecated
         julia_optimization=3,
         ):
@@ -140,6 +141,10 @@ def pysr(X=None, y=None, weights=None,
         15% faster. May be algorithmically less efficient.
     :param variable_names: list, a list of names for the variables, other
         than "x0", "x1", etc.
+    :param select_k_features: (None, int), whether to run feature selection in
+        Python using random forests, before passing to the symbolic regression
+        code. None means no feature selection; an int means select that many
+        features.
     :param julia_optimization: int, Optimization level (0, 1, 2, 3)
     :returns: pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
         (as strings).
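For reference, a minimal usage sketch of the new option documented above; the operator names, toy data, and import style are illustrative assumptions rather than part of this commit:

import numpy as np
from pysr import pysr

# Toy data: only columns 2 and 4 actually influence y.
X = np.random.randn(100, 5)
y = 2.5 * X[:, 2] + np.cos(X[:, 4])

# Pre-select the 2 most relevant columns before the Julia backend runs.
equations = pysr(X, y,
                 binary_operators=["plus", "mult"],
                 unary_operators=["cos"],
                 select_k_features=2)
print(equations)  # pd.DataFrame of discovered equations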
@@ -154,6 +159,8 @@ def pysr(X=None, y=None, weights=None,
         variable_names = list(X.columns)
         X = np.array(X)
 
+    use_custom_variable_names = (len(variable_names) != 0)
+
     # Check for potential errors before they happen
     assert len(unary_operators) + len(binary_operators) > 0
     assert len(X.shape) == 2
@@ -162,9 +169,17 @@ def pysr(X=None, y=None, weights=None,
     if weights is not None:
         assert len(weights.shape) == 1
         assert X.shape[0] == weights.shape[0]
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         assert len(variable_names) == X.shape[1]
 
+    if select_k_features is not None:
+        selection = run_feature_selection(X, y, select_k_features)
+        print(f"Using features {selection}")
+        X = X[:, selection]
+
+        if use_custom_variable_names:
+            variable_names = variable_names[selection]
+
     if populations is None:
         populations = procs
 
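Note that run_feature_selection (added at the end of this diff) returns a NumPy integer index array, so the subsetting above relies on NumPy fancy indexing; a plain Python list of names would need to be converted to an array (or filtered with a comprehension) first. A small standalone illustration with made-up names:

import numpy as np

X = np.random.randn(100, 4)
variable_names = np.array(["mass", "radius", "velocity", "charge"])
selection = np.array([0, 2])                 # e.g. output of run_feature_selection

X = X[:, selection]                          # keep only the selected columns
variable_names = variable_names[selection]
print(variable_names.tolist())               # ['mass', 'velocity']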
@@ -233,7 +248,7 @@ const nrestarts = {nrestarts:d}
 const perturbationFactor = {perturbationFactor:f}f0
 const annealing = {"true" if annealing else "false"}
 const weighted = {"true" if weights is not None else "false"}
-const useVarMap = {"false" if len(variable_names) == 0 else "true"}
+const useVarMap = {"true" if use_custom_variable_names else "false"}
 const mutationWeights = [
 {weightMutateConstant:f},
 {weightMutateOperator:f},
@@ -260,7 +275,7 @@ const y = convert(Array{Float32, 1}, """f"{y_str})"
         def_datasets += """
 const weights = convert(Array{Float32, 1}, """f"{weight_str})"
 
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         def_hyperparams += f"""
 const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
 
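The changed lines above splice Python values into the Julia source that pysr writes out. A quick sketch of what the varMap fragment evaluates to (the variable names here are illustrative):

variable_names = ["mass", "velocity"]
def_hyperparams = f"""
const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
print(def_hyperparams)
#
# const varMap = ["mass", "velocity"]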
@@ -299,7 +314,7 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
     lastComplexity = 0
     sympy_format = []
     lambda_format = []
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(X.shape[1])]
     else:
         sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(X.shape[1])]
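The symbol list built above is what later lets the returned equation strings be parsed and evaluated. A standalone sketch of that downstream use with SymPy; the equation string and the sympify/lambdify flow are illustrative, not the exact code path in sr.py:

import sympy

variable_names = ["mass", "velocity"]
sympy_symbols = [sympy.Symbol(name) for name in variable_names]
expr = sympy.sympify("mass * velocity**2 / 2",
                     locals={s.name: s for s in sympy_symbols})
energy = sympy.lambdify(sympy_symbols, expr)
print(energy(3.0, 2.0))   # 6.0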
@@ -326,3 +341,18 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
     return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
 
 
+def run_feature_selection(X, y, select_k_features):
+    """Use a gradient boosting tree regressor as a proxy for finding
+    the k most important features in X, returning indices for those
+    features as output."""
+
+    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+    from sklearn.feature_selection import SelectFromModel, SelectKBest
+
+    clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls') #RandomForestRegressor()
+    clf.fit(X, y)
+    selector = SelectFromModel(clf, threshold=-np.inf,
+                               max_features=select_k_features, prefit=True)
+    return selector.get_support(indices=True)
+
+
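For context, a self-contained sketch of the selection strategy implemented by run_feature_selection, run on synthetic data where only two columns are informative. The data are made up, but the GradientBoostingRegressor/SelectFromModel usage mirrors the function above (newer scikit-learn releases rename loss='ls' to loss='squared_error'):

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel

rng = np.random.RandomState(0)
X = rng.randn(300, 5)
y = 3.0 * X[:, 1] - 2.0 * X[:, 3] + 0.1 * rng.randn(300)   # columns 1 and 3 matter

clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0)
clf.fit(X, y)
selector = SelectFromModel(clf, threshold=-np.inf,
                           max_features=2, prefit=True)
print(selector.get_support(indices=True))   # should print [1 3] for this data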