import random
from typing import List

import numpy as np
import pandas as pd
from joblib import load
from pydantic import (
    BaseModel,
    ValidationError,
    ValidationInfo,
    field_validator,
    model_validator,
)

# Hyperparameter search space: range and choice specifications for each CrabNet
# hyperparameter.
PARAM_BOUNDS = [
    {"name": "N", "type": "range", "bounds": [1, 10]},
    {"name": "alpha", "type": "range", "bounds": [0.0, 1.0]},
    {"name": "d_model", "type": "range", "bounds": [100, 1024]},
    {"name": "dim_feedforward", "type": "range", "bounds": [1024, 4096]},
    {"name": "dropout", "type": "range", "bounds": [0.0, 1.0]},
    {"name": "emb_scaler", "type": "range", "bounds": [0.0, 1.0]},
    {"name": "epochs_step", "type": "range", "bounds": [5, 20]},
    {"name": "eps", "type": "range", "bounds": [1e-7, 1e-4]},
    {"name": "fudge", "type": "range", "bounds": [0.0, 0.1]},
    {"name": "heads", "type": "range", "bounds": [1, 10]},
    {"name": "k", "type": "range", "bounds": [2, 10]},
    {"name": "lr", "type": "range", "bounds": [1e-4, 6e-3]},
    {"name": "pe_resolution", "type": "range", "bounds": [2500, 10000]},
    {"name": "ple_resolution", "type": "range", "bounds": [2500, 10000]},
    {"name": "pos_scaler", "type": "range", "bounds": [0.0, 1.0]},
    {"name": "weight_decay", "type": "range", "bounds": [0.0, 1.0]},
    {"name": "batch_size", "type": "range", "bounds": [32, 256]},
    {"name": "out_hidden4", "type": "range", "bounds": [32, 512]},
    {"name": "betas1", "type": "range", "bounds": [0.5, 0.9999]},
    {"name": "betas2", "type": "range", "bounds": [0.5, 0.9999]},
    {"name": "bias", "type": "choice", "values": [False, True]},
    {"name": "criterion", "type": "choice", "values": ["RobustL1", "RobustL2"]},
    {"name": "elem_prop", "type": "choice", "values": ["mat2vec", "magpie", "onehot"]},
    {"name": "train_frac", "type": "range", "bounds": [0.01, 1.0]},
]
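

# Illustrative sketch (not part of the original module): draw a single random
# candidate from PARAM_BOUNDS, e.g. for smoke-testing the validators defined
# below. The helper name `_sample_random_candidate` is hypothetical; note that a
# raw draw can still violate the cross-parameter constraints enforced by
# `Parameterization` (betas1 <= betas2 and emb_scaler + pos_scaler <= 1.0).
def _sample_random_candidate(seed: int = 0) -> dict:
    rng = random.Random(seed)
    candidate = {}
    for param in PARAM_BOUNDS:
        if param["type"] == "range":
            low, high = param["bounds"]
            if isinstance(low, int) and isinstance(high, int):
                # Integer-valued range parameters (e.g., N, heads, batch_size)
                candidate[param["name"]] = rng.randint(low, high)
            else:
                candidate[param["name"]] = rng.uniform(low, high)
        else:  # "choice" parameters (bias, criterion, elem_prop)
            candidate[param["name"]] = rng.choice(param["values"])
    return candidate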


class Parameterization(BaseModel):
    """Validated CrabNet hyperparameter set.

    Fields marked with ``# int`` are integer-valued parameters accepted as floats.
    """

    N: float  # int
    alpha: float
    d_model: float  # int
    dim_feedforward: float  # int
    dropout: float
    emb_scaler: float
    epochs_step: float  # int
    eps: float
    fudge: float
    heads: float  # int
    k: float  # int
    lr: float
    pe_resolution: float  # int
    ple_resolution: float  # int
    pos_scaler: float
    weight_decay: float  # int
    batch_size: float  # int
    out_hidden4: float  # int
    betas1: float
    betas2: float
    bias: bool
    criterion: str
    elem_prop: str
    train_frac: float

    @field_validator("*")
    @classmethod
    def check_bounds(cls, v, info: ValidationInfo):
        """Check each field against its range bounds or allowed choices in PARAM_BOUNDS."""
        param = next(
            (item for item in PARAM_BOUNDS if item["name"] == info.field_name),
            None,
        )
        if param is None:
            return v
        if param["type"] == "range":
            min_val, max_val = param["bounds"]
            if not min_val <= v <= max_val:
                raise ValueError(
                    f"{info.field_name} must be between {min_val} and {max_val}"
                )
        elif param["type"] == "choice":
            if v not in param["values"]:
                raise ValueError(f"{info.field_name} must be one of {param['values']}")
        return v

    @model_validator(mode="after")
    def check_constraints(self) -> "Parameterization":
        """Enforce cross-parameter constraints that individual bounds cannot capture."""
        if self.betas1 > self.betas2:
            raise ValueError(
                f"Received betas1={self.betas1} which should be less than betas2={self.betas2}"
            )
        if self.emb_scaler + self.pos_scaler > 1.0:
            raise ValueError(
                f"Received emb_scaler={self.emb_scaler} and pos_scaler={self.pos_scaler}, which should sum to at most 1.0"  # noqa: E501
            )
        return self
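

# Illustrative sketch (not part of the original module): validate a candidate
# with the Parameterization model. It relies on the hypothetical
# `_sample_random_candidate` helper sketched above; a random draw can violate
# the cross-parameter constraints, in which case a ValidationError is raised.
def _validate_candidate_example(seed: int = 0) -> None:
    candidate = _sample_random_candidate(seed)
    try:
        validated = Parameterization(**candidate)
        print("Valid candidate:", validated.model_dump())
    except ValidationError as exc:
        print("Rejected candidate:", exc)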


class CrabNetSurrogateModel:
    """Surrogate models, loaded from a pickle, that predict CrabNet MAE, RMSE,
    runtime, and model size for a given hyperparameter set."""

    def __init__(self, fpath="models/surrogate_models_hgbr_opt.pkl"):
        self.models = load(fpath)

    def prepare_params_for_eval(self, raw_params: dict):
        """Convert a raw parameter set into the feature encoding the surrogate
        models expect."""
        raw_params["bias"] = int(raw_params["bias"])
        raw_params["use_RobustL1"] = raw_params["criterion"] == "RobustL1"
        del raw_params["criterion"]
        # REVIEW: HistGradientBoostingRegressor handles categoricals natively now
        # https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-categorical-py # noqa: E501
        # One-hot encode elem_prop
        elem_prop = raw_params["elem_prop"]
        raw_params["elem_prop_magpie"] = 0
        raw_params["elem_prop_mat2vec"] = 0
        raw_params["elem_prop_onehot"] = 0
        raw_params[f"elem_prop_{elem_prop}"] = 1
        del raw_params["elem_prop"]
        return raw_params
    def surrogate_evaluate(
        self, params_list: List[dict], seed=None, remove_noise=False
    ):
        """Predict MAE, RMSE, runtime, and model size for each parameter set."""
        assert isinstance(params_list, list), "Input must be a list of dictionaries"
        # Validate the parameters (i.e., will throw an error if invalid)
        [Parameterization(**params) for params in params_list]
        parameters = pd.DataFrame(params_list)
        parameters = parameters.apply(self.prepare_params_for_eval, axis=1)

        if remove_noise:
            mae_percentiles = [0.5] * len(parameters)
            rmse_percentiles = [0.5] * len(parameters)
            runtime_percentiles = [0.5] * len(parameters)
        else:
            # Random number generator; unseeded (intentional) unless a seed is provided
            rng = np.random.default_rng(seed)
            # Generate random percentiles for each set of parameters for
            # heteroskedastic, parameter-free noise
            mae_percentiles = rng.uniform(0, 1, size=len(parameters))
            rmse_percentiles = mae_percentiles  # typically correlated with MAE
            # typically anticorrelated with MAE/RMSE
            runtime_percentiles = 1 - mae_percentiles

        # Make predictions with each model
        mae_model = self.models["mae"]
        rmse_model = self.models["rmse"]
        runtime_model = self.models["runtime"]
        model_size_model = self.models["model_size"]

        # NOTE: The models expect the variables in the same order as when they were fit
        mae = mae_model.predict(
            parameters.assign(mae_rank=mae_percentiles)[mae_model.feature_names_in_]
        )
        rmse = rmse_model.predict(
            parameters.assign(rmse_rank=rmse_percentiles)[rmse_model.feature_names_in_]
        )
        runtime = runtime_model.predict(
            parameters.assign(runtime_rank=runtime_percentiles)[
                runtime_model.feature_names_in_
            ]
        )
        # Model size is deterministic (hence no rank variable)
        model_size = model_size_model.predict(
            parameters[model_size_model.feature_names_in_]
        )

        # Combine predictions into a list of dictionaries
        results = [
            {"mae": m, "rmse": r, "runtime": rt, "model_size": ms}
            for m, r, rt, ms in zip(mae, rmse, runtime, model_size)
        ]
        return results
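

# Illustrative usage sketch (not part of the original module). It assumes the
# pickled surrogate models exist at the default path used by
# CrabNetSurrogateModel.__init__; `example_params` is a hypothetical, in-bounds
# parameter set chosen for demonstration.
if __name__ == "__main__":
    example_params = {
        "N": 3,
        "alpha": 0.5,
        "d_model": 512,
        "dim_feedforward": 2048,
        "dropout": 0.1,
        "emb_scaler": 0.5,
        "epochs_step": 10,
        "eps": 1e-6,
        "fudge": 0.02,
        "heads": 4,
        "k": 6,
        "lr": 1e-3,
        "pe_resolution": 5000,
        "ple_resolution": 5000,
        "pos_scaler": 0.5,
        "weight_decay": 0.0,
        "batch_size": 128,
        "out_hidden4": 128,
        "betas1": 0.9,
        "betas2": 0.999,
        "bias": False,
        "criterion": "RobustL1",
        "elem_prop": "mat2vec",
        "train_frac": 0.5,
    }
    surrogate = CrabNetSurrogateModel()
    print(surrogate.surrogate_evaluate([example_params], seed=42))
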
# %% Code Graveyard
# runtime_percentiles = np.random.uniform(
# 0, 1, size=len(parameters)
# )