import time

import joblib
from os import path
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from joblib import Parallel, delayed
from sklearn.base import clone  # used so CV fold fits don't overwrite the fully-trained model
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, RandomizedSearchCV
from scipy.stats import uniform, randint

model_type = "hgbr"  # "hgbr" or "rfr"
optimize_hyperparameters = True
dummy = False
n_jobs = -1  # Number of jobs to run in parallel. -1 means using all processors.

data_dir = "."
model_dir = "models"

assert model_type in [
    "hgbr",
    "rfr",
], f"Invalid model type: {model_type}, must be 'hgbr' or 'rfr'"

if dummy:
    model_dir = path.join(model_dir, "dummy")
Path(model_dir).mkdir(exist_ok=True, parents=True)

sobol_reg = pd.read_csv(path.join(data_dir, "sobol_regression.csv"))

if dummy:
    data_dir = path.join(data_dir, "dummy")
    sobol_reg = sobol_reg.head(100)
    Path(data_dir).mkdir(exist_ok=True, parents=True)

# One-hot encode the categorical columns. Note that hardware_ohe is created but
# not concatenated into the feature set below.
elemprop_ohe = pd.get_dummies(sobol_reg["elem_prop"], prefix="elem_prop")
hardware_ohe = pd.get_dummies(sobol_reg["hardware"], prefix="hardware")
sobol_reg["use_RobustL1"] = sobol_reg["criterion"] == "RobustL1"
sobol_reg["bias"] = sobol_reg["bias"].astype(int)
sobol_reg = pd.concat([sobol_reg, elemprop_ohe], axis=1)

common_features = [
    "N",
    "alpha",
    "d_model",
    "dim_feedforward",
    "dropout",
    "emb_scaler",
    "eps",
    "epochs_step",
    "fudge",
    "heads",
    "k",
    "lr",
    "pe_resolution",
    "ple_resolution",
    "pos_scaler",
    "weight_decay",
    "batch_size",
    "out_hidden4",
    "betas1",
    "betas2",
    "train_frac",
    "bias",
    "use_RobustL1",
    "elem_prop_magpie",
    "elem_prop_mat2vec",
    "elem_prop_onehot",
]

mae_features = common_features + ["mae_rank"]
X_array_mae = sobol_reg[mae_features]
y_array_mae = sobol_reg[["mae"]]
mae_model_stem = path.join(model_dir, "sobol_reg_mae")

rmse_features = common_features + ["rmse_rank"]
X_array_rmse = sobol_reg[rmse_features]
y_array_rmse = sobol_reg[["rmse"]]
rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")

# no model_size_rank because model_size is deterministic via
# `crabnet.utils.utils.count_parameters`
model_size_features = common_features
X_array_model_size = sobol_reg[model_size_features]
y_array_model_size = sobol_reg[["model_size"]]
model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")

runtime_features = common_features + ["runtime_rank"]
X_array_runtime = sobol_reg[runtime_features]
y_array_runtime = sobol_reg[["runtime"]]
runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")


def train_and_save(
    sr_feat_array,
    sr_labels_array,
    sr_label_names,
    optimize_hyperparameters=False,
):
    models = {}
    timings = {}
    # cv_scores = []
    avg_cv_scores = {}
    cv_predictions = {}
    for X1, y1, name1 in zip(sr_feat_array, sr_labels_array, sr_label_names):
        y1 = y1.squeeze()
        print(f"X1 sr shape: {X1.shape}, y1 sr shape: {y1.shape}")
        if model_type == "rfr":
            model = RandomForestRegressor(random_state=13)
        elif model_type == "hgbr":
            model = HistGradientBoostingRegressor(random_state=13)

        if optimize_hyperparameters:
            # define hyperparameters to tune
            if model.__class__.__name__ == "HistGradientBoostingRegressor":
                param_dist = {
                    "max_iter": randint(100, 200),
                    "max_leaf_nodes": [None, 30, 50],
                    "learning_rate": uniform(0.01, 0.1),
                    # Add more hyperparameters here as needed
                }
            elif model.__class__.__name__ == "RandomForestRegressor":
                param_dist = {
                    "n_estimators": randint(100, 200),
                    # 1.0 (all features) replaces "auto", which was removed in scikit-learn 1.3
                    "max_features": [1.0, "sqrt"],
                    "max_depth": randint(10, 50),
"min_samples_split": randint(2, 10), # Add more hyperparameters here as needed } # Use RandomizedSearchCV to tune the hyperparameters random_search = RandomizedSearchCV( model, param_dist, n_iter=10, cv=5, scoring="neg_mean_squared_error", random_state=13, n_jobs=n_jobs, ) start_time = time.time() # REVIEW: use y1.values.ravel() instead of y1 to flatten y1 to a 1D array random_search.fit(X1, y1) end_time = time.time() # Use the best estimator found by RandomizedSearchCV model = random_search.best_estimator_ timings[name1] = end_time - start_time else: start_time = time.time() model.fit(X1, y1) end_time = time.time() timings[name1] = end_time - start_time print(f"Trained {name1} in {timings[name1]} seconds") # Perform cross-validation manually to keep track of predictions # NOTE: This doesn't use GroupKFold, which would prevent cross-leakage for the rank column # cv = KFold(n_splits=5) # cv_preds = [] # for train_index, test_index in cv.split(X1): # X_train, X_test = X1.iloc[train_index], X1.iloc[test_index] # y_train, y_test = y1.iloc[train_index], y1.iloc[test_index] # model.fit(X_train, y_train) # preds = model.predict(X_test) # cv_preds.extend(preds) # cv_scores.append(mean_squared_error(y_test, preds)) # avg_cv_scores[name1] = np.sqrt(np.mean(cv_scores)) # cv_predictions[name1] = cv_preds def cross_validate(X1, y1, model): cv = KFold(n_splits=5) cv_preds = [] cv_scores = [] for train_index, test_index in cv.split(X1): X_train, X_test = X1.iloc[train_index], X1.iloc[test_index] y_train, y_test = y1.iloc[train_index], y1.iloc[test_index] model.fit(X_train, y_train) preds = model.predict(X_test) cv_preds.extend(preds) cv_scores.append(mean_squared_error(y_test, preds)) return cv_preds, np.sqrt(np.mean(cv_scores)) cv_predictions[name1], avg_cv_scores[name1] = cross_validate(X1, y1, model) # # Parallelize the outer loop # results = Parallel(n_jobs=n_jobs)( # delayed(cross_validate)(X1, y1, model) # for X1, y1 in zip(sr_feat_array, sr_labels_array) # ) # # Unpack the results # cv_predictions, avg_cv_scores = zip(*results) # # Convert the results to dictionaries # cv_predictions = dict(zip(sobol_reg_target_names, cv_predictions)) # avg_cv_scores = dict(zip(sobol_reg_target_names, avg_cv_scores)) print(f"Cross-validated score for {name1}: {avg_cv_scores[name1]}") models[name1] = model print() return models, timings, avg_cv_scores, cv_predictions # List of x_arrays, y_arrays, and target_names sobol_reg_x_arrays = [X_array_mae, X_array_rmse, X_array_model_size, X_array_runtime] sobol_reg_labels = [y_array_mae, y_array_rmse, y_array_model_size, y_array_runtime] sobol_reg_target_names = ["mae", "rmse", "model_size", "runtime"] # Train and save the model on all the data models, timings, avg_cv_scores, cv_predictions = train_and_save( sobol_reg_x_arrays, sobol_reg_labels, sobol_reg_target_names, optimize_hyperparameters=optimize_hyperparameters, # if true, probably ~16 min for iter=5 & cv=3 ) print(f"Timings (in seconds): {timings}") # doesn't include cross_val_score runtime print(f"Cross-validated scores: {avg_cv_scores}") # Save timings and cv_scores to a CSV file results = pd.DataFrame( { "Model": list(timings.keys()), "Timing": list(timings.values()), "CV Score": list(avg_cv_scores.values()), } ) # Determine the model type and optimization status model_type = ( "hgbr" if isinstance(next(iter(models.values())), HistGradientBoostingRegressor) else "rfr" ) opt_status = "opt" if optimize_hyperparameters else "no_opt" # Save the results and models with the updated filenames results_filename = 
f"model_results_{model_type}_{opt_status}.csv" models_filename = f"surrogate_models_{model_type}_{opt_status}.pkl" results.to_csv(path.join(model_dir, results_filename), index=False) joblib.dump(models, path.join(model_dir, models_filename), compress=7) # NOTE: Can use this if looking at how well it memorizes the training data # # Generate predictions for each model # predictions = { # name: model.predict(X) # for name, model, X in zip( # sobol_reg_target_names, models.values(), sobol_reg_x_arrays # ) # } # Create a 2x2 grid of subplots fig, axs = plt.subplots(2, 2, figsize=(8, 8)) # Flatten the axs array for easy iteration axs = axs.flatten() for ax, name in zip(axs, sobol_reg_target_names): # Get the true and predicted values for this model true_values = sobol_reg[name] predicted_values = cv_predictions[name] # Create the hexbin plot with log scaling hb = ax.hexbin( true_values, predicted_values, gridsize=50, cmap="viridis", bins="log" ) cb = plt.colorbar(hb, ax=ax) cb.set_label("counts (log scale)") ax.plot( [true_values.min(), true_values.max()], [true_values.min(), true_values.max()], "w--", ) ax.set_xlabel("True Values") ax.set_ylabel("Predicted Values") ax.set_title(f"Parity Plot for {name}") # Set the aspect ratio to be equal ax.set_aspect("equal") # Adjust the layout and show the plot plt.tight_layout() # Save the plot with the updated filename plot_filename = f"parity_plot_{model_type}_{opt_status}.png" plt.savefig(path.join(model_dir, plot_filename), dpi=300) plt.show() 1 + 1 # %% Code Graveyard # # Compute cross-validated score # cv_score = cross_val_score( # model, X1, y1, cv=5, scoring="neg_mean_squared_error" # ) # cv_scores[name1] = np.sqrt(np.abs(cv_score.mean()))