sgbaird committed
Commit 58815da
1 Parent(s): b8fce21

Refactor evaluate function in app.py to include parameter scaling and unscaled evaluation

Files changed (1): train_surrogate.py (+320 −3)
train_surrogate.py CHANGED
@@ -1,3 +1,320 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d70119e59352312f64ab5b620d6f1ccc62616af7ed03ab3efa09ac49b814c019
- size 10257
+ import time
+ from os import path
+ from pathlib import Path
+
+ import joblib
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+
+ # from joblib import Parallel, delayed
+
+ from scipy.stats import randint, uniform
+ from sklearn.base import clone  # used to clone the trained model during manual CV
+ from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
+ from sklearn.metrics import mean_squared_error
+ from sklearn.model_selection import KFold, RandomizedSearchCV
+
+ model_type = "hgbr"  # "hgbr" or "rfr"
+ optimize_hyperparameters = True
+ dummy = False
+ n_jobs = -1  # number of jobs to run in parallel; -1 means use all processors
+
+ data_dir = "."
+ model_dir = "models"
+
+ assert model_type in [
+     "hgbr",
+     "rfr",
+ ], f"Invalid model type: {model_type}, must be 'hgbr' or 'rfr'"
+
+ if dummy:
+     model_dir = path.join(model_dir, "dummy")
+
+ Path(model_dir).mkdir(exist_ok=True, parents=True)
+
+ sobol_reg = pd.read_csv(path.join(data_dir, "sobol_regression.csv"))
+
+ if dummy:
+     data_dir = path.join(data_dir, "dummy")
+     sobol_reg = sobol_reg.head(100)
+
+ Path(data_dir).mkdir(exist_ok=True, parents=True)
+
+ elemprop_ohe = pd.get_dummies(sobol_reg["elem_prop"], prefix="elem_prop")
+ hardware_ohe = pd.get_dummies(sobol_reg["hardware"], prefix="hardware")
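+ # NOTE: hardware_ohe is created but never concatenated into sobol_reg below, so
+ # hardware does not end up among the surrogate features; this may be intentional
+ # (hardware is arguably a nuisance variable rather than a design variable).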
+
+ sobol_reg["use_RobustL1"] = sobol_reg["criterion"] == "RobustL1"
+
+ sobol_reg["bias"] = sobol_reg["bias"].astype(int)
+
+ sobol_reg = pd.concat([sobol_reg, elemprop_ohe], axis=1)
+
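+ # Encoding rationale: the criterion string becomes a boolean flag and bias is
+ # cast to int so that every surrogate input column is numeric, as the
+ # scikit-learn ensemble regressors below expect.
+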
+ common_features = [
+     "N",
+     "alpha",
+     "d_model",
+     "dim_feedforward",
+     "dropout",
+     "emb_scaler",
+     "eps",
+     "epochs_step",
+     "fudge",
+     "heads",
+     "k",
+     "lr",
+     "pe_resolution",
+     "ple_resolution",
+     "pos_scaler",
+     "weight_decay",
+     "batch_size",
+     "out_hidden4",
+     "betas1",
+     "betas2",
+     "train_frac",
+     "bias",
+     "use_RobustL1",
+     "elem_prop_magpie",
+     "elem_prop_mat2vec",
+     "elem_prop_onehot",
+ ]
+
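+ # These appear to be CrabNet hyperparameters (cf. the crabnet reference below),
+ # i.e., the design variables the surrogates map to each objective.
+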
+
+ mae_features = common_features + ["mae_rank"]
+ X_array_mae = sobol_reg[mae_features]
+ y_array_mae = sobol_reg[["mae"]]
+ mae_model_stem = path.join(model_dir, "sobol_reg_mae")
+
+ rmse_features = common_features + ["rmse_rank"]
+ X_array_rmse = sobol_reg[rmse_features]
+ y_array_rmse = sobol_reg[["rmse"]]
+ rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")
+
+ # no model_size_rank because model_size is deterministic via
+ # `crabnet.utils.utils.count_parameters`
+ model_size_features = common_features
+ X_array_model_size = sobol_reg[model_size_features]
+ y_array_model_size = sobol_reg[["model_size"]]
+ model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")
+
+ runtime_features = common_features + ["runtime_rank"]
+ X_array_runtime = sobol_reg[runtime_features]
+ y_array_runtime = sobol_reg[["runtime"]]
+ runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")
+
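+ # A more compact alternative to the per-target blocks above (untested sketch):
+ # rank_cols = {"mae": "mae_rank", "rmse": "rmse_rank", "model_size": None, "runtime": "runtime_rank"}
+ # bundles = {
+ #     t: (sobol_reg[common_features + ([r] if r else [])], sobol_reg[[t]])
+ #     for t, r in rank_cols.items()
+ # }
+ # The explicit blocks are kept for readability.
+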
+ def train_and_save(
+     sr_feat_array,
+     sr_labels_array,
+     sr_label_names,
+     optimize_hyperparameters=False,
+ ):
+     models = {}
+     timings = {}
+     # cv_scores = []
+     avg_cv_scores = {}
+     cv_predictions = {}
+
+     for X1, y1, name1 in zip(sr_feat_array, sr_labels_array, sr_label_names):
+         y1 = y1.squeeze()
+         print(f"X1 sr shape: {X1.shape}, y1 sr shape: {y1.shape}")
+
+         if model_type == "rfr":
+             model = RandomForestRegressor(random_state=13)
+         elif model_type == "hgbr":
+             model = HistGradientBoostingRegressor(random_state=13)
+
+         if optimize_hyperparameters:
+             # define hyperparameters to tune
+             if model.__class__.__name__ == "HistGradientBoostingRegressor":
+                 param_dist = {
+                     "max_iter": randint(100, 200),
+                     "max_leaf_nodes": [None, 30, 50],
+                     "learning_rate": uniform(0.01, 0.1),
+                     # Add more hyperparameters here as needed
+                 }
+             elif model.__class__.__name__ == "RandomForestRegressor":
+                 param_dist = {
+                     "n_estimators": randint(100, 200),
+                     # "auto" was deprecated and then removed for regressors in
+                     # scikit-learn 1.3; 1.0 (all features) is the equivalent
+                     "max_features": [1.0, "sqrt"],
+                     "max_depth": randint(10, 50),
+                     "min_samples_split": randint(2, 10),
+                     # Add more hyperparameters here as needed
+                 }
+
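+             # NOTE: scipy's uniform(loc, scale) samples from [loc, loc + scale], so
+             # uniform(0.01, 0.1) draws learning rates from the range [0.01, 0.11].
+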
+             # Use RandomizedSearchCV to tune the hyperparameters
+             random_search = RandomizedSearchCV(
+                 model,
+                 param_dist,
+                 n_iter=10,
+                 cv=5,
+                 scoring="neg_mean_squared_error",
+                 random_state=13,
+                 n_jobs=n_jobs,
+             )
+
+             start_time = time.time()
+             # y1 is squeezed to a 1D Series above, so no ravel() is needed here
+             random_search.fit(X1, y1)
+             end_time = time.time()
+
+             # Use the best estimator found by RandomizedSearchCV
+             model = random_search.best_estimator_
+             timings[name1] = end_time - start_time
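+             # NOTE: RandomizedSearchCV refits best_estimator_ on all of X1/y1 by
+             # default (refit=True), so no additional full fit is needed here.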
+         else:
+             start_time = time.time()
+             model.fit(X1, y1)
+             end_time = time.time()
+             timings[name1] = end_time - start_time
+
+         print(f"Trained {name1} in {timings[name1]:.1f} seconds")
+
+         # Perform cross-validation manually to keep track of predictions
+         # NOTE: this doesn't use GroupKFold, which would prevent cross-leakage
+         # for the rank column
+         # cv = KFold(n_splits=5)
+         # cv_preds = []
+         # for train_index, test_index in cv.split(X1):
+         #     X_train, X_test = X1.iloc[train_index], X1.iloc[test_index]
+         #     y_train, y_test = y1.iloc[train_index], y1.iloc[test_index]
+         #     model.fit(X_train, y_train)
+         #     preds = model.predict(X_test)
+         #     cv_preds.extend(preds)
+         #     cv_scores.append(mean_squared_error(y_test, preds))
+         # avg_cv_scores[name1] = np.sqrt(np.mean(cv_scores))
+         # cv_predictions[name1] = cv_preds
+
+         def cross_validate(X1, y1, model):
+             cv = KFold(n_splits=5)
+             cv_preds = []
+             cv_scores = []
+             for train_index, test_index in cv.split(X1):
+                 X_train, X_test = X1.iloc[train_index], X1.iloc[test_index]
+                 y_train, y_test = y1.iloc[train_index], y1.iloc[test_index]
+                 # fit a fresh clone per fold so the fully trained model above
+                 # isn't left fit on only the final fold when it is saved later
+                 fold_model = clone(model)
+                 fold_model.fit(X_train, y_train)
+                 preds = fold_model.predict(X_test)
+                 cv_preds.extend(preds)
+                 cv_scores.append(mean_squared_error(y_test, preds))
+             return cv_preds, np.sqrt(np.mean(cv_scores))
+
+         cv_predictions[name1], avg_cv_scores[name1] = cross_validate(X1, y1, model)
+
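+         # NOTE: when optimize_hyperparameters is True, these CV folds reuse the
+         # data that already tuned the hyperparameters, so the scores can be
+         # mildly optimistic relative to truly held-out data.
+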
+         # # Parallelize the outer loop
+         # results = Parallel(n_jobs=n_jobs)(
+         #     delayed(cross_validate)(X1, y1, model)
+         #     for X1, y1 in zip(sr_feat_array, sr_labels_array)
+         # )
+
+         # # Unpack the results
+         # cv_predictions, avg_cv_scores = zip(*results)
+
+         # # Convert the results to dictionaries
+         # cv_predictions = dict(zip(sobol_reg_target_names, cv_predictions))
+         # avg_cv_scores = dict(zip(sobol_reg_target_names, avg_cv_scores))
+
+         print(f"Cross-validated score for {name1}: {avg_cv_scores[name1]}")
+
+         models[name1] = model
+
+         print()
+
+     return models, timings, avg_cv_scores, cv_predictions
+
+
+ # Lists of X arrays, y arrays, and target names
+ sobol_reg_x_arrays = [X_array_mae, X_array_rmse, X_array_model_size, X_array_runtime]
+ sobol_reg_labels = [y_array_mae, y_array_rmse, y_array_model_size, y_array_runtime]
+ sobol_reg_target_names = ["mae", "rmse", "model_size", "runtime"]
+
+ # Train and save the models on all the data
+ models, timings, avg_cv_scores, cv_predictions = train_and_save(
+     sobol_reg_x_arrays,
+     sobol_reg_labels,
+     sobol_reg_target_names,
+     # if True, expect a long run; an earlier run took ~16 min with n_iter=5
+     # and cv=3, and the current n_iter=10, cv=5 settings will take longer
+     optimize_hyperparameters=optimize_hyperparameters,
+ )
+
+ print(f"Timings (in seconds): {timings}")  # doesn't include cross-validation runtime
+ print(f"Cross-validated scores: {avg_cv_scores}")
+
+ # Save timings and CV scores to a CSV file
+ results = pd.DataFrame(
+     {
+         "Model": list(timings.keys()),
+         "Timing (s)": list(timings.values()),
+         "CV Score (RMSE)": list(avg_cv_scores.values()),
+     }
+ )
+
+ # Determine the model type (re-derived from the trained models) and optimization status
+ model_type = (
+     "hgbr"
+     if isinstance(next(iter(models.values())), HistGradientBoostingRegressor)
+     else "rfr"
+ )
+ opt_status = "opt" if optimize_hyperparameters else "no_opt"
+
+ # Save the results and models with the updated filenames
+ results_filename = f"model_results_{model_type}_{opt_status}.csv"
+ models_filename = f"surrogate_models_{model_type}_{opt_status}.pkl"
+
+ results.to_csv(path.join(model_dir, results_filename), index=False)
+ joblib.dump(models, path.join(model_dir, models_filename), compress=7)
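+ # To reload and query the saved surrogates later (untested sketch):
+ # models = joblib.load(path.join(model_dir, models_filename))
+ # mae_pred = models["mae"].predict(X_array_mae)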
+
+ # NOTE: can use this to check how well the models memorize the training data
+ # # Generate predictions for each model
+ # predictions = {
+ #     name: model.predict(X)
+ #     for name, model, X in zip(
+ #         sobol_reg_target_names, models.values(), sobol_reg_x_arrays
+ #     )
+ # }
+
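+ # Parity plots of cross-validated predictions vs. true values. KFold is used
+ # without shuffling, so the concatenated fold predictions align with
+ # sobol_reg's row order; hexbin with log-scaled counts keeps dense regions legible.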
+ # Create a 2x2 grid of subplots
+ fig, axs = plt.subplots(2, 2, figsize=(8, 8))
+
+ # Flatten the axs array for easy iteration
+ axs = axs.flatten()
+
+ for ax, name in zip(axs, sobol_reg_target_names):
+     # Get the true and predicted values for this model
+     true_values = sobol_reg[name]
+     predicted_values = cv_predictions[name]
+
+     # Create the hexbin plot with log scaling
+     hb = ax.hexbin(
+         true_values, predicted_values, gridsize=50, cmap="viridis", bins="log"
+     )
+     cb = plt.colorbar(hb, ax=ax)
+     cb.set_label("counts (log scale)")
+
+     # Plot the y = x parity line
+     ax.plot(
+         [true_values.min(), true_values.max()],
+         [true_values.min(), true_values.max()],
+         "w--",
+     )
+     ax.set_xlabel("True Values")
+     ax.set_ylabel("Predicted Values")
+     ax.set_title(f"Parity Plot for {name}")
+
+     # Set the aspect ratio to be equal
+     ax.set_aspect("equal")
+
+ # Adjust the layout and show the plot
+ plt.tight_layout()
+
+ # Save the plot with the updated filename
+ plot_filename = f"parity_plot_{model_type}_{opt_status}.png"
+ plt.savefig(path.join(model_dir, plot_filename), dpi=300)
+
+ plt.show()
+
+
+ # %% Code Graveyard
+
+ # # Compute cross-validated score
+ # cv_score = cross_val_score(
+ #     model, X1, y1, cv=5, scoring="neg_mean_squared_error"
+ # )
+ # cv_scores[name1] = np.sqrt(np.abs(cv_score.mean()))