|
import time |
|
import joblib |
|
from os import path |
|
from pathlib import Path |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
|
|
|
|
|
|
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor |
|
from sklearn.metrics import mean_squared_error |
|
from sklearn.model_selection import RandomizedSearchCV |
|
from sklearn.model_selection import KFold |
|
|
|
from scipy.stats import uniform, randint |
|
|
|
# --- Run configuration -------------------------------------------------------
model_type = "hgbr"  # surrogate family: "hgbr" or "rfr"
optimize_hyperparameters = True  # run RandomizedSearchCV before the final fit
dummy = False  # if True, smoke-test on a 100-row subset and write to */dummy
n_jobs = -1  # parallelism for the hyperparameter search (-1 = all cores)

data_dir = "."
model_dir = "models"

# Validate with a real exception rather than `assert`, which is silently
# stripped when Python runs with -O. Message kept identical to the original.
if model_type not in ("hgbr", "rfr"):
    raise ValueError(f"Invalid model type: {model_type}, must be 'hgbr' or 'rfr'")
|
|
|
# Keep dummy-run artifacts in their own subdirectory so real models are safe.
if dummy:
    model_dir = path.join(model_dir, "dummy")
Path(model_dir).mkdir(parents=True, exist_ok=True)

# Load the full Sobol regression dataset from the (pre-redirect) data_dir.
csv_path = path.join(data_dir, "sobol_regression.csv")
sobol_reg = pd.read_csv(csv_path)

if dummy:
    # NOTE(review): data_dir is redirected only AFTER the CSV is read and is
    # not used again below — confirm whether the dummy data dir is needed.
    data_dir = path.join(data_dir, "dummy")
    sobol_reg = sobol_reg.head(100)
    Path(data_dir).mkdir(parents=True, exist_ok=True)
|
|
|
# One-hot encode the categorical columns.
elemprop_ohe = pd.get_dummies(sobol_reg["elem_prop"], prefix="elem_prop")
# NOTE(review): hardware_ohe is built but never joined into sobol_reg below —
# confirm whether hardware features were meant to be included.
hardware_ohe = pd.get_dummies(sobol_reg["hardware"], prefix="hardware")

# Binary indicator for the loss criterion, plus an integer-cast bias flag.
sobol_reg["use_RobustL1"] = sobol_reg["criterion"] == "RobustL1"
sobol_reg["bias"] = sobol_reg["bias"].astype(int)

# Attach only the elem_prop one-hot columns to the working frame.
sobol_reg = pd.concat([sobol_reg, elemprop_ohe], axis=1)
|
|
|
# Feature columns shared by every surrogate target. Order matters: it fixes
# the column order of each design matrix built below.
common_features = [
    # raw CrabNet hyperparameters
    "N", "alpha", "d_model", "dim_feedforward", "dropout", "emb_scaler",
    "eps", "epochs_step", "fudge", "heads", "k", "lr",
    "pe_resolution", "ple_resolution", "pos_scaler", "weight_decay",
    "batch_size", "out_hidden4", "betas1", "betas2", "train_frac",
    # engineered binary / one-hot features
    "bias", "use_RobustL1",
    "elem_prop_magpie", "elem_prop_mat2vec", "elem_prop_onehot",
]
|
|
|
|
|
# Per-target design matrix, label column, and on-disk model stem.
# NOTE(review): the *_rank features are derived from the targets themselves —
# confirm they do not leak label information into the surrogates.
mae_features = common_features + ["mae_rank"]
X_array_mae, y_array_mae = sobol_reg[mae_features], sobol_reg[["mae"]]
mae_model_stem = path.join(model_dir, "sobol_reg_mae")

rmse_features = common_features + ["rmse_rank"]
X_array_rmse, y_array_rmse = sobol_reg[rmse_features], sobol_reg[["rmse"]]
rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")

# model_size has no rank feature; it uses the common features unchanged.
model_size_features = common_features
X_array_model_size = sobol_reg[model_size_features]
y_array_model_size = sobol_reg[["model_size"]]
model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")

runtime_features = common_features + ["runtime_rank"]
X_array_runtime = sobol_reg[runtime_features]
y_array_runtime = sobol_reg[["runtime"]]
runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")
|
|
|
|
|
def _cross_validate(X, y, model, n_splits=5):
    """Run unshuffled n_splits-fold CV, returning (out-of-fold preds, RMSE).

    The score is sqrt(mean of per-fold MSEs). Predictions are concatenated in
    fold order; with an unshuffled KFold this matches the row order of X/y.
    """
    cv = KFold(n_splits=n_splits)
    preds = []
    fold_mses = []
    for train_index, test_index in cv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        fold_preds = model.predict(X_test)
        preds.extend(fold_preds)
        fold_mses.append(mean_squared_error(y_test, fold_preds))
    return preds, np.sqrt(np.mean(fold_mses))


def train_and_save(
    sr_feat_array,
    sr_labels_array,
    sr_label_names,
    optimize_hyperparameters=False,
):
    """Train one surrogate per target, optionally tuned, and cross-validate it.

    Parameters
    ----------
    sr_feat_array : list of pandas DataFrames, one design matrix per target.
    sr_labels_array : list of single-column DataFrames, parallel to the above.
    sr_label_names : list of target-name strings, parallel to the above.
    optimize_hyperparameters : bool, if True run RandomizedSearchCV first and
        keep the best estimator.

    Returns
    -------
    (models, timings, avg_cv_scores, cv_predictions) : four dicts keyed by
        target name — fitted estimator, fit wall-time in seconds, CV RMSE,
        and out-of-fold predictions.

    Raises
    ------
    ValueError
        If the module-level ``model_type`` is neither "rfr" nor "hgbr".
    """
    models = {}
    timings = {}
    avg_cv_scores = {}
    cv_predictions = {}

    for X1, y1, name1 in zip(sr_feat_array, sr_labels_array, sr_label_names):
        y1 = y1.squeeze()  # (n, 1) DataFrame -> (n,) Series for sklearn
        print(f"X1 sr shape: {X1.shape}, Y1 sr shape: {y1.shape}")

        if model_type == "rfr":
            model = RandomForestRegressor(random_state=13)
        elif model_type == "hgbr":
            model = HistGradientBoostingRegressor(random_state=13)
        else:
            # Previously `model` was silently left unbound here, producing a
            # confusing NameError later; fail fast instead.
            raise ValueError(f"Invalid model type: {model_type}")

        if optimize_hyperparameters:
            if isinstance(model, HistGradientBoostingRegressor):
                param_dist = {
                    "max_iter": randint(100, 200),
                    "max_leaf_nodes": [None, 30, 50],
                    "learning_rate": uniform(0.01, 0.1),
                }
            else:  # RandomForestRegressor
                param_dist = {
                    "n_estimators": randint(100, 200),
                    # "auto" was deprecated and then removed (scikit-learn
                    # 1.3); for regressors it meant all features, so 1.0 is
                    # the drop-in replacement.
                    "max_features": ["sqrt", 1.0],
                    "max_depth": randint(10, 50),
                    "min_samples_split": randint(2, 10),
                }

            random_search = RandomizedSearchCV(
                model,
                param_dist,
                n_iter=10,
                cv=5,
                scoring="neg_mean_squared_error",
                random_state=13,
                n_jobs=n_jobs,
            )

            start_time = time.time()
            random_search.fit(X1, y1)
            end_time = time.time()

            model = random_search.best_estimator_
            timings[name1] = end_time - start_time
        else:
            start_time = time.time()
            model.fit(X1, y1)
            end_time = time.time()
            timings[name1] = end_time - start_time

        print(f"Trained {name1} in {timings[name1]} seconds")

        # The CV helper is module-level now; the original redefined it inside
        # this loop on every iteration.
        cv_predictions[name1], avg_cv_scores[name1] = _cross_validate(X1, y1, model)

        print(f"Cross-validated score for {name1}: {avg_cv_scores[name1]}")

        models[name1] = model

        print()

    return models, timings, avg_cv_scores, cv_predictions
|
|
|
|
|
|
|
# Per-target inputs, all three lists sharing one fixed order.
sobol_reg_target_names = ["mae", "rmse", "model_size", "runtime"]
sobol_reg_x_arrays = [X_array_mae, X_array_rmse, X_array_model_size, X_array_runtime]
sobol_reg_labels = [y_array_mae, y_array_rmse, y_array_model_size, y_array_runtime]

# Train (and optionally tune) one surrogate per target.
(models, timings, avg_cv_scores, cv_predictions) = train_and_save(
    sobol_reg_x_arrays,
    sobol_reg_labels,
    sobol_reg_target_names,
    optimize_hyperparameters=optimize_hyperparameters,
)
|
|
|
print(f"Timings (in seconds): {timings}")
print(f"Cross-validated scores: {avg_cv_scores}")

# Tabulate per-target training time and CV score for export.
results = pd.DataFrame(
    {
        "Model": list(timings),  # dict iteration yields the keys
        "Timing": list(timings.values()),
        "CV Score": list(avg_cv_scores.values()),
    }
)
|
|
|
|
|
# Re-derive the model family from the trained estimators (rather than trusting
# the earlier module flag) and tag output files with the optimization status.
first_model = next(iter(models.values()))
if isinstance(first_model, HistGradientBoostingRegressor):
    model_type = "hgbr"
else:
    model_type = "rfr"
opt_status = "opt" if optimize_hyperparameters else "no_opt"

results_filename = f"model_results_{model_type}_{opt_status}.csv"
models_filename = f"surrogate_models_{model_type}_{opt_status}.pkl"

results.to_csv(path.join(model_dir, results_filename), index=False)
# compress=7 trades dump time for a much smaller pickle on disk.
joblib.dump(models, path.join(model_dir, models_filename), compress=7)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Parity plots: out-of-fold CV predictions vs. true values, one panel per
# target. NOTE: relies on _cross_validate's unshuffled KFold so that
# cv_predictions line up row-for-row with sobol_reg — confirm if CV changes.
fig, axs = plt.subplots(2, 2, figsize=(8, 8))

axs = axs.flatten()

for ax, name in zip(axs, sobol_reg_target_names):

    true_values = sobol_reg[name]
    predicted_values = cv_predictions[name]

    # Log-binned hexbin keeps the dense diagonal readable.
    hb = ax.hexbin(
        true_values, predicted_values, gridsize=50, cmap="viridis", bins="log"
    )
    cb = plt.colorbar(hb, ax=ax)
    cb.set_label("counts (log scale)")

    # y = x reference line: where a perfect predictor would fall.
    ax.plot(
        [true_values.min(), true_values.max()],
        [true_values.min(), true_values.max()],
        "w--",
    )
    ax.set_xlabel("True Values")
    ax.set_ylabel("Predicted Values")
    ax.set_title(f"Parity Plot for {name}")

    ax.set_aspect("equal")

plt.tight_layout()

plot_filename = f"parity_plot_{model_type}_{opt_status}.png"
plt.savefig(path.join(model_dir, plot_filename), dpi=300)

plt.show()

# (Removed a stray no-op `1 + 1` left over from a notebook cell.)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|