sgbaird committed
Commit 45b3fc1
1 Parent(s): 58815da

restore versions that got overwritten/deleted

Files changed (2)
  1. app.py +120 -12
  2. surrogate.py +120 -67
app.py CHANGED
@@ -1,15 +1,19 @@
  import gradio as gr
- from surrogate import CrabNetSurrogateModel
 
  model = CrabNetSurrogateModel()
 
- example_parameterization = parameterization = {
      "N": 3,
      "alpha": 0.5,
      "d_model": 512,
      "dim_feedforward": 2048,
      "dropout": 0.1,
-     "emb_scaler": 1.0,
      "epochs_step": 10,
      "eps": 0.000001,
      "fudge": 0.02,
@@ -18,26 +22,130 @@ example_parameterization = parameterization = {
      "lr": 0.001,
      "pe_resolution": 5000,
      "ple_resolution": 5000,
-     "pos_scaler": 1.0,
      "weight_decay": 0,
      "batch_size": 32,
      "out_hidden4": 128,
-     "betas2": 0.9,
-     "betas1": 0.999,
-     "losscurve": False,
-     "learningcurve": False,
      "bias": False,
      "criterion": "RobustL1",
      "elem_prop": "mat2vec",
      "train_frac": 0.5,
  }
 
- model.surrogate_evaluate(example_parameterization)
 
 
- def greet(name):
-     return "Hello " + name + "!!"
 
 
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 
  iface.launch()
 
+ import numpy as np
  import gradio as gr
+ import pandas as pd
+ from sklearn.preprocessing import MinMaxScaler
+ from surrogate import CrabNetSurrogateModel, PARAM_BOUNDS
 
  model = CrabNetSurrogateModel()
 
+ # Define the input parameters
+ example_parameterization = {
      "N": 3,
      "alpha": 0.5,
      "d_model": 512,
      "dim_feedforward": 2048,
      "dropout": 0.1,
+     "emb_scaler": 0.5,
      "epochs_step": 10,
      "eps": 0.000001,
      "fudge": 0.02,
 
      "lr": 0.001,
      "pe_resolution": 5000,
      "ple_resolution": 5000,
+     "pos_scaler": 0.5,
      "weight_decay": 0,
      "batch_size": 32,
      "out_hidden4": 128,
+     "betas1": 0.9,
+     "betas2": 0.999,
      "bias": False,
      "criterion": "RobustL1",
      "elem_prop": "mat2vec",
      "train_frac": 0.5,
  }
 
+ # Define the output parameters
+ example_results = model.surrogate_evaluate([example_parameterization])
+ example_result = example_results[0]
 
 
+ def evaluate(*args):
+     # Create a DataFrame with the parameter names and scaled values
+     params_df = pd.DataFrame([args], columns=[param["name"] for param in PARAM_BOUNDS])
+
+     # Reverse the scaling for each parameter and reverse the renaming for choice parameters
+     for param_info in PARAM_BOUNDS:
+         key = param_info["name"]
+         if param_info["type"] == "range":
+             scaler = scalers[key]
+             params_df[key] = scaler.inverse_transform(params_df[[key]])
+         elif param_info["type"] == "choice":
+             # Extract the index from the renamed choice and use it to get the original choice
+             choice_index = int(params_df[key].str.split("_").str[-1].iloc[0])
+             params_df[key] = param_info["values"][choice_index]
+
+     # Convert the DataFrame to a list of dictionaries
+     params_list = params_df.to_dict("records")
+
+     # Evaluate the model with the unscaled parameters
+     results = model.surrogate_evaluate(params_list)
+
+     # Convert list of dictionaries to list of lists
+     results_list = [list(result.values()) for result in results]
+     return results_list
+
+
+ scalers = {
+     param_info["name"]: MinMaxScaler()
+     for param_info in PARAM_BOUNDS
+     if param_info["type"] == "range"
+ }
+
+
+ def get_interface(param_info, numeric_index, choice_index):
+     key = param_info["name"]
+     default_value = example_parameterization[key]
+     if param_info["type"] == "range":
+         # Rescale the parameter to be between 0 and 1
+         scaler = scalers[key]
+         scaler.fit([[bound] for bound in param_info["bounds"]])
+         scaled_value = scaler.transform([[default_value]])[0][0]
+         scaled_bounds = scaler.transform([[bound] for bound in param_info["bounds"]])
+         label = "f1" if key == "train_frac" else f"x{numeric_index}"
+         return (
+             gr.Number(
+                 value=scaled_value,
+                 minimum=scaled_bounds[0][0],
+                 maximum=scaled_bounds[1][0],
+                 label=label,
+                 step=(scaled_bounds[1][0] - scaled_bounds[0][0]) / 100,
+             ),
+             numeric_index + 1,
+             choice_index,
+         )
+     elif param_info["type"] == "choice":
+         return (
+             gr.Dropdown(
+                 choices=[
+                     f"c{choice_index}_{i}" for i in range(len(param_info["values"]))
+                 ],
+                 label=f"c{choice_index}",
+                 value=f"c{choice_index}_{param_info['values'].index(default_value)}",
+             ),
+             numeric_index,
+             choice_index + 1,
+         )
+
+
+ numeric_index = 1
+ choice_index = 1
+ inputs = []
+ for param in PARAM_BOUNDS:
+     input, numeric_index, choice_index = get_interface(
+         param, numeric_index, choice_index
+     )
+     inputs.append(input)
+
+ iface = gr.Interface(
+     title="CrabNetSurrogateModel",
+     fn=evaluate,
+     inputs=inputs,
+     outputs=gr.Numpy(
+         value=np.array([list(example_result.values())]),
+         headers=[f"y{i+1}" for i in range(len(example_result))],
+         col_count=(len(example_result), "fixed"),
+         datatype=["number"] * len(example_result),
+     ),
+     description="""
+     `y1`, `y2`, `y3`, and `y4` should all be minimized. `y1` and `y2` are
+     correlated, whereas `y1` and `y2` are both anticorrelated with `y3`. `y1`,
+     `y2`, and `y3` are stochastic (heteroskedastic, parameter-free noise),
+     whereas `y4` is deterministic, but still considered 'black-box'. In other
+     words, repeat calls with the same input arguments will result in different
+     values for `y1`, `y2`, and `y3`, but the same value for `y4`.
+
+     If `y1` is less than 0.2, the result is considered "bad" no matter how good
+     the other values are. If `y2` is less than 0.7, the result is considered
+     "bad" no matter how good the other values are. If `y3` is greater than
+     1800, the result is considered "bad" no matter how good the other values
+     are. If `y4` is greater than 40e6, the result is considered "bad" no matter
+     how good the other values are.
+
+     `fidelity1` is a fidelity parameter. 0 is the lowest fidelity, and 1 is the
+     highest fidelity. The higher the fidelity, typically the more expensive the
+     evaluation. However, this also typically means higher quality and relevance
+     to the optimization campaign goals. `fidelity1` and `y3` are correlated.
+     """,
+ )
  iface.launch()
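
For context on the `evaluate` wrapper above: each range parameter is shown in the UI on a 0-1 scale and mapped back to its raw bounds before the surrogate is called. A minimal sketch of that round-trip, using only scikit-learn and the `lr` bounds from `PARAM_BOUNDS` (illustrative, not part of the commit):

from sklearn.preprocessing import MinMaxScaler

# Fit on the raw bounds of "lr" ([1e-4, 6e-3] in PARAM_BOUNDS)
scaler = MinMaxScaler()
scaler.fit([[1e-4], [6e-3]])

scaled = scaler.transform([[1e-3]])[0][0]         # raw lr -> unit interval (~0.1525)
raw = scaler.inverse_transform([[scaled]])[0][0]  # back to the raw lr (1e-3)

The same fitted scaler is what `get_interface` uses to compute each input's displayed default, minimum, maximum, and step.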
surrogate.py CHANGED
@@ -1,34 +1,43 @@
  from joblib import load
  import pandas as pd
  import random
- from pydantic import BaseModel, ValidationInfo, field_validator
-
- PARAM_CONSTRAINTS = {
-     "N": {"type": "range", "bounds": [1, 10]},
-     "alpha": {"type": "range", "bounds": [0.0, 1.0]},
-     "d_model": {"type": "range", "bounds": [100, 1024]},
-     "dim_feedforward": {"type": "range", "bounds": [1024, 4096]},
-     "dropout": {"type": "range", "bounds": [0.0, 1.0]},
-     "emb_scaler": {"type": "range", "bounds": [0.0, 1.0]},
-     "eps": {"type": "range", "bounds": [1e-7, 1e-4]},
-     "epochs_step": {"type": "range", "bounds": [5, 20]},
-     "fudge": {"type": "range", "bounds": [0.0, 0.1]},
-     "heads": {"type": "range", "bounds": [1, 10]},
-     "k": {"type": "range", "bounds": [2, 10]},
-     "lr": {"type": "range", "bounds": [1e-4, 6e-3]},
-     "pe_resolution": {"type": "range", "bounds": [2500, 10000]},
-     "ple_resolution": {"type": "range", "bounds": [2500, 10000]},
-     "pos_scaler": {"type": "range", "bounds": [0.0, 1.0]},
-     "weight_decay": {"type": "range", "bounds": [0.0, 1.0]},
-     "batch_size": {"type": "range", "bounds": [32, 256]},
-     "out_hidden4": {"type": "range", "bounds": [32, 512]},
-     "betas1": {"type": "range", "bounds": [0.5, 0.9999]},
-     "betas2": {"type": "range", "bounds": [0.5, 0.9999]},
-     "bias": {"type": "choice", "values": [False, True]},
-     "criterion": {"type": "choice", "values": ["RobustL1", "RobustL2"]},
-     "elem_prop": {"type": "choice", "values": ["mat2vec", "magpie", "onehot"]},
-     "train_frac": {"type": "range", "bounds": [0.01, 1.0]},
- }
 
 
  class Parameterization(BaseModel):
@@ -52,16 +61,17 @@ class Parameterization(BaseModel):
      out_hidden4: int
      betas1: float
      betas2: float
-     losscurve: bool
-     learningcurve: bool
      bias: bool
      criterion: str
      elem_prop: str
      train_frac: float
 
      @field_validator("*")
-     def check_constraints(cls, v: int, info: ValidationInfo) -> int:
-         param = PARAM_CONSTRAINTS.get(info.field_name)
          if param is None:
              return v
 
@@ -75,60 +85,103 @@ class Parameterization(BaseModel):
          if v not in param["values"]:
              raise ValueError(f"{info.field_name} must be one of {param['values']}")
 
-         if (
-             info.field_name in ("betas1", "betas2")
-             and "betas1" in field.owner
-             and "betas2" in field.owner
-         ):
-             if field.owner["betas1"] > field.owner["betas2"]:
-                 raise ValueError("betas1 must be less than or equal to betas2")
-         if (
-             info.field_name in ("emb_scaler", "pos_scaler")
-             and "emb_scaler" in field.owner
-             and "pos_scaler" in field.owner
-         ):
-             if field.owner["emb_scaler"] + field.owner["pos_scaler"] > 1.0:
-                 raise ValueError(
-                     "The sum of emb_scaler and pos_scaler must be less than or equal to 1.0"
-                 )
-
          return v
 
 
  class CrabNetSurrogateModel(object):
-     def __init__(self, fpath="surrogate_models.pkl"):
          self.models = load(fpath)
-         pass
 
-     def prepare_params_for_eval(self, raw_params: Parameterization):
          raw_params["bias"] = int(raw_params["bias"])
          raw_params["use_RobustL1"] = raw_params["criterion"] == "RobustL1"
-         raw_params["criterion"] = None
-
-         raw_params["losscurve"] = None
-         raw_params["learningcurve"] = None
 
          elem_prop = raw_params["elem_prop"]
          raw_params["elem_prop_magpie"] = 0
          raw_params["elem_prop_mat2vec"] = 0
          raw_params["elem_prop_onehot"] = 0
          raw_params[f"elem_prop_{elem_prop}"] = 1
-         raw_params["elem_prop"] = None
 
          return raw_params
 
-     def surrogate_evaluate(self, params: Parameterization):
-
-         parameters = self.prepare_params_for_eval(params)
-         parameters = pd.DataFrame([parameters])
 
-         percentile = random.uniform(0, 1)  # generate random percentile
 
-         mae = self.models["mae"].predict(parameters.assign(mae_rank=[percentile]))
-         rmse = self.models["rmse"].predict(parameters.assign(rmse_rank=[percentile]))
          runtime = self.models["runtime"].predict(
-             parameters.assign(runtime_rank=[percentile])
          )
-         model_size = self.models["model_size"].predict(parameters)
 
-         return mae, rmse, runtime, model_size
+ from click import Parameter
+ import numpy as np
  from joblib import load
+ from typing import List
  import pandas as pd
  import random
+ from pydantic import (
+     BaseModel,
+     ValidationError,
+     ValidationInfo,
+     field_validator,
+     model_validator,
+ )
+
+ PARAM_BOUNDS = [
+     {"name": "N", "type": "range", "bounds": [1, 10]},
+     {"name": "alpha", "type": "range", "bounds": [0.0, 1.0]},
+     {"name": "d_model", "type": "range", "bounds": [100, 1024]},
+     {"name": "dim_feedforward", "type": "range", "bounds": [1024, 4096]},
+     {"name": "dropout", "type": "range", "bounds": [0.0, 1.0]},
+     {"name": "emb_scaler", "type": "range", "bounds": [0.0, 1.0]},
+     {"name": "epochs_step", "type": "range", "bounds": [5, 20]},
+     {"name": "eps", "type": "range", "bounds": [1e-7, 1e-4]},
+     {"name": "fudge", "type": "range", "bounds": [0.0, 0.1]},
+     {"name": "heads", "type": "range", "bounds": [1, 10]},
+     {"name": "k", "type": "range", "bounds": [2, 10]},
+     {"name": "lr", "type": "range", "bounds": [1e-4, 6e-3]},
+     {"name": "pe_resolution", "type": "range", "bounds": [2500, 10000]},
+     {"name": "ple_resolution", "type": "range", "bounds": [2500, 10000]},
+     {"name": "pos_scaler", "type": "range", "bounds": [0.0, 1.0]},
+     {"name": "weight_decay", "type": "range", "bounds": [0.0, 1.0]},
+     {"name": "batch_size", "type": "range", "bounds": [32, 256]},
+     {"name": "out_hidden4", "type": "range", "bounds": [32, 512]},
+     {"name": "betas1", "type": "range", "bounds": [0.5, 0.9999]},
+     {"name": "betas2", "type": "range", "bounds": [0.5, 0.9999]},
+     {"name": "bias", "type": "choice", "values": [False, True]},
+     {"name": "criterion", "type": "choice", "values": ["RobustL1", "RobustL2"]},
+     {"name": "elem_prop", "type": "choice", "values": ["mat2vec", "magpie", "onehot"]},
+     {"name": "train_frac", "type": "range", "bounds": [0.01, 1.0]},
+ ]
 
 
  class Parameterization(BaseModel):
 
      out_hidden4: int
      betas1: float
      betas2: float
      bias: bool
      criterion: str
      elem_prop: str
      train_frac: float
 
      @field_validator("*")
+     def check_bounds(cls, v: int, info: ValidationInfo) -> int:
+         param = next(
+             (item for item in PARAM_BOUNDS if item["name"] == info.field_name),
+             None,
+         )
          if param is None:
              return v
 
          if v not in param["values"]:
              raise ValueError(f"{info.field_name} must be one of {param['values']}")
 
          return v
 
+     @model_validator(mode="after")
+     def check_constraints(self) -> "Parameterization":
+         if self.betas1 > self.betas2:
+             raise ValueError(
+                 f"Received betas1={self.betas1} which should be less than betas2={self.betas2}"
+             )
+         if self.emb_scaler + self.pos_scaler > 1.0:
+             raise ValueError(
+                 f"Received emb_scaler={self.emb_scaler} and pos_scaler={self.pos_scaler} which should sum to less than or equal to 1.0"  # noqa: E501
+             )
+         return self
 
  class CrabNetSurrogateModel(object):
+     def __init__(self, fpath="models/surrogate_models_hgbr_opt.pkl"):
          self.models = load(fpath)
 
+     def prepare_params_for_eval(self, raw_params: dict):
          raw_params["bias"] = int(raw_params["bias"])
          raw_params["use_RobustL1"] = raw_params["criterion"] == "RobustL1"
+         del raw_params["criterion"]
 
+         # REVIEW: HistGradientBoostingRegressor handles categoricals natively now
+         # https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-categorical-py  # noqa: E501
          elem_prop = raw_params["elem_prop"]
          raw_params["elem_prop_magpie"] = 0
          raw_params["elem_prop_mat2vec"] = 0
          raw_params["elem_prop_onehot"] = 0
          raw_params[f"elem_prop_{elem_prop}"] = 1
+         del raw_params["elem_prop"]
 
          return raw_params
 
+     def surrogate_evaluate(
+         self, params_list: List[dict], seed=None, remove_noise=False
+     ):
+         assert isinstance(params_list, list), "Input must be a list of dictionaries"
+         # Validate the parameters (i.e., will throw error if invalid)
+         [Parameterization(**params) for params in params_list]
+
+         parameters = pd.DataFrame(params_list)
+         parameters = parameters.apply(self.prepare_params_for_eval, axis=1)
+
+         if remove_noise:
+             mae_percentiles = [0.5] * len(parameters)
+             rmse_percentiles = [0.5] * len(parameters)
+             runtime_percentiles = [0.5] * len(parameters)
+         else:
+             # Random number generator (unseeded unless `seed` is given)
+             rng = np.random.default_rng(seed)
+
+             # Generate random percentiles for each set of parameters for
+             # heteroskedastic, parameter-free noise
+             mae_percentiles = rng.uniform(0, 1, size=len(parameters))
+             rmse_percentiles = mae_percentiles  # typically correlated with MAE
+
+             # typically anticorrelated with MAE/RMSE
+             runtime_percentiles = 1 - mae_percentiles
+
+         # Make predictions for each model
+         mae_model = self.models["mae"]
+         rmse_model = self.models["rmse"]
+         runtime_model = self.models["runtime"]
+         model_size_model = self.models["model_size"]
+
+         # NOTE: The model expects the variables in the same order as when it was fit
+         mae = self.models["mae"].predict(
+             parameters.assign(mae_rank=mae_percentiles)[mae_model.feature_names_in_]
+         )
 
+         rmse = self.models["rmse"].predict(
+             parameters.assign(rmse_rank=rmse_percentiles)[rmse_model.feature_names_in_]
+         )
 
          runtime = self.models["runtime"].predict(
+             parameters.assign(runtime_rank=runtime_percentiles)[
+                 runtime_model.feature_names_in_
+             ]
+         )
+
+         # Model size is deterministic (hence no rank variable)
+         model_size = self.models["model_size"].predict(
+             parameters[model_size_model.feature_names_in_]
          )
 
+         # Combine predictions into a list of dictionaries
+         results = [
+             {"mae": m, "rmse": r, "runtime": rt, "model_size": ms}
+             for m, r, rt, ms in zip(mae, rmse, runtime, model_size)
+         ]
+
+         return results
 
 
+ # %% Code Graveyard
 
+ # runtime_percentiles = np.random.uniform(
+ #     0, 1, size=len(parameters)
+ # )
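
Taken together, the restored `surrogate.py` exposes a batched, optionally noise-free evaluation API with pydantic validation up front. A hypothetical usage sketch (not part of the commit), assuming the pickled models exist at the default `fpath` and reusing the full `example_parameterization` dict from `app.py` above:

from pydantic import ValidationError
from surrogate import CrabNetSurrogateModel, Parameterization

model = CrabNetSurrogateModel()  # loads models/surrogate_models_hgbr_opt.pkl

# Noise-free evaluation: every rank percentile is pinned at the median (0.5)
(noiseless,) = model.surrogate_evaluate([example_parameterization], remove_noise=True)

# Stochastic evaluation: repeat calls vary mae/rmse/runtime but not model_size
(noisy,) = model.surrogate_evaluate([example_parameterization])
print(noiseless, noisy)

# The model validator rejects constraint violations, e.g. emb_scaler + pos_scaler > 1.0
try:
    Parameterization(**{**example_parameterization, "emb_scaler": 0.9, "pos_scaler": 0.5})
except ValidationError as err:
    print(err)

Note that `seed` can be passed to `surrogate_evaluate` when reproducible noise is needed.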