import time
import joblib
from os import path
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from joblib import Parallel, delayed

from sklearn.base import clone
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, RandomizedSearchCV

from scipy.stats import uniform, randint

model_type = "hgbr"  # "hgbr" or "rfr"
optimize_hyperparameters = True
dummy = False
n_jobs = -1  # Number of jobs to run in parallel. -1 means using all processors.

data_dir = "."
model_dir = "models"

assert model_type in [
    "hgbr",
    "rfr",
], f"Invalid model type: {model_type}, must be 'hgbr' or 'rfr'"

if dummy:
    model_dir = path.join(model_dir, "dummy")

Path(model_dir).mkdir(exist_ok=True, parents=True)

sobol_reg = pd.read_csv(path.join(data_dir, "sobol_regression.csv"))

if dummy:
    data_dir = path.join(data_dir, "dummy")
    sobol_reg = sobol_reg.head(100)

Path(data_dir).mkdir(exist_ok=True, parents=True)

elemprop_ohe = pd.get_dummies(sobol_reg["elem_prop"], prefix="elem_prop")
hardware_ohe = pd.get_dummies(sobol_reg["hardware"], prefix="hardware")
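# NOTE: hardware_ohe is built but never concatenated into sobol_reg, so the
# hardware columns are not part of the feature set below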

sobol_reg["use_RobustL1"] = sobol_reg["criterion"] == "RobustL1"

sobol_reg["bias"] = sobol_reg["bias"].astype(int)

sobol_reg = pd.concat([sobol_reg, elemprop_ohe], axis=1)

common_features = [
    "N",
    "alpha",
    "d_model",
    "dim_feedforward",
    "dropout",
    "emb_scaler",
    "eps",
    "epochs_step",
    "fudge",
    "heads",
    "k",
    "lr",
    "pe_resolution",
    "ple_resolution",
    "pos_scaler",
    "weight_decay",
    "batch_size",
    "out_hidden4",
    "betas1",
    "betas2",
    "train_frac",
    "bias",
    "use_RobustL1",
    "elem_prop_magpie",
    "elem_prop_mat2vec",
    "elem_prop_onehot",
]
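
# Each target below (except model_size) adds its own rank column to these
# shared features; see the cross-leakage NOTE inside train_and_save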


mae_features = common_features + ["mae_rank"]
X_array_mae = sobol_reg[mae_features]
y_array_mae = sobol_reg[["mae"]]
mae_model_stem = path.join(model_dir, "sobol_reg_mae")

rmse_features = common_features + ["rmse_rank"]
X_array_rmse = sobol_reg[rmse_features]
y_array_rmse = sobol_reg[["rmse"]]
rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")

# no model_size_rank because model_size is deterministic via
# `crabnet.utils.utils.count_parameters`
model_size_features = common_features
X_array_model_size = sobol_reg[model_size_features]
y_array_model_size = sobol_reg[["model_size"]]
model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")

runtime_features = common_features + ["runtime_rank"]
X_array_runtime = sobol_reg[runtime_features]
y_array_runtime = sobol_reg[["runtime"]]
runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")


def train_and_save(
    sr_feat_array,
    sr_labels_array,
    sr_label_names,
    optimize_hyperparameters=False,
):
    models = {}
    timings = {}
    # cv_scores = []
    avg_cv_scores = {}
    cv_predictions = {}

    for X1, y1, name1 in zip(sr_feat_array, sr_labels_array, sr_label_names):
        y1 = y1.squeeze()
        print(f"X1 sr shape: {X1.shape}, Y1 sr shape: {y1.shape}")

        if model_type == "rfr":
            model = RandomForestRegressor(random_state=13)
        elif model_type == "hgbr":
            model = HistGradientBoostingRegressor(random_state=13)

        if optimize_hyperparameters:
            # define hyperparameters to tune
            if model.__class__.__name__ == "HistGradientBoostingRegressor":
                param_dist = {
                    "max_iter": randint(100, 200),
                    "max_leaf_nodes": [None, 30, 50],
                    "learning_rate": uniform(0.01, 0.1),
                    # Add more hyperparameters here as needed
                }
            elif model.__class__.__name__ == "RandomForestRegressor":
                param_dist = {
                    "n_estimators": randint(100, 200),
                    "max_features": ["auto", "sqrt"],
                    "max_depth": randint(10, 50),
                    "min_samples_split": randint(2, 10),
                    # Add more hyperparameters here as needed
                }

            # Use RandomizedSearchCV to tune the hyperparameters
            random_search = RandomizedSearchCV(
                model,
                param_dist,
                n_iter=10,
                cv=5,
                scoring="neg_mean_squared_error",
                random_state=13,
                n_jobs=n_jobs,
            )

            start_time = time.time()
            # y1 was squeezed to a 1D Series above, so it can be passed directly
            random_search.fit(X1, y1)
            end_time = time.time()

            # Use the best estimator found by RandomizedSearchCV
            model = random_search.best_estimator_
            timings[name1] = end_time - start_time
        else:
            start_time = time.time()
            model.fit(X1, y1)
            end_time = time.time()
            timings[name1] = end_time - start_time

        print(f"Trained {name1} in {timings[name1]} seconds")

        # Perform cross-validation manually to keep track of predictions.
        # NOTE: This doesn't use GroupKFold, which would prevent cross-leakage
        # for the rank column (see the sketch after the call below)

        def cross_validate(X1, y1, model):
            cv = KFold(n_splits=5)
            cv_preds = []
            cv_scores = []
            for train_index, test_index in cv.split(X1):
                X_train, X_test = X1.iloc[train_index], X1.iloc[test_index]
                y_train, y_test = y1.iloc[train_index], y1.iloc[test_index]
                # clone() gives each fold a fresh, unfitted copy so the
                # full-data fit stored in `models` isn't overwritten
                fold_model = clone(model)
                fold_model.fit(X_train, y_train)
                preds = fold_model.predict(X_test)
                cv_preds.extend(preds)
                cv_scores.append(mean_squared_error(y_test, preds))
            return cv_preds, np.sqrt(np.mean(cv_scores))

        cv_predictions[name1], avg_cv_scores[name1] = cross_validate(X1, y1, model)
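
        # Sketch (assumption): if rows sharing a rank value should stay in the
        # same fold, GroupKFold would enforce that. `group_labels` is
        # hypothetical and would need to be derived from the data:
        # from sklearn.model_selection import GroupKFold
        # cv = GroupKFold(n_splits=5)
        # for train_index, test_index in cv.split(X1, y1, groups=group_labels):
        #     ...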

        # # Parallelize the outer loop
        # results = Parallel(n_jobs=n_jobs)(
        #     delayed(cross_validate)(X1, y1, model)
        #     for X1, y1 in zip(sr_feat_array, sr_labels_array)
        # )

        # # Unpack the results
        # cv_predictions, avg_cv_scores = zip(*results)

        # # Convert the results to dictionaries
        # cv_predictions = dict(zip(sobol_reg_target_names, cv_predictions))
        # avg_cv_scores = dict(zip(sobol_reg_target_names, avg_cv_scores))

        print(f"Cross-validated score for {name1}: {avg_cv_scores[name1]}")

        models[name1] = model

        print()

    return models, timings, avg_cv_scores, cv_predictions


# List of x_arrays, y_arrays, and target_names
sobol_reg_x_arrays = [X_array_mae, X_array_rmse, X_array_model_size, X_array_runtime]
sobol_reg_labels = [y_array_mae, y_array_rmse, y_array_model_size, y_array_runtime]
sobol_reg_target_names = ["mae", "rmse", "model_size", "runtime"]

# Train and save the model on all the data
models, timings, avg_cv_scores, cv_predictions = train_and_save(
    sobol_reg_x_arrays,
    sobol_reg_labels,
    sobol_reg_target_names,
    optimize_hyperparameters=optimize_hyperparameters,  # if True, expect much longer runtime (~16 min was seen with n_iter=5 and cv=3; the search above uses n_iter=10, cv=5)
)

print(f"Timings (in seconds): {timings}")  # doesn't include cross_val_score runtime
print(f"Cross-validated scores: {avg_cv_scores}")

# Save timings and cv_scores to a CSV file
results = pd.DataFrame(
    {
        "Model": list(timings.keys()),
        "Timing": list(timings.values()),
        "CV Score": list(avg_cv_scores.values()),
    }
)

# Determine the model type and optimization status
model_type = (
    "hgbr"
    if isinstance(next(iter(models.values())), HistGradientBoostingRegressor)
    else "rfr"
)
opt_status = "opt" if optimize_hyperparameters else "no_opt"

# Save the results and models with the updated filenames
results_filename = f"model_results_{model_type}_{opt_status}.csv"
models_filename = f"surrogate_models_{model_type}_{opt_status}.pkl"

results.to_csv(path.join(model_dir, results_filename), index=False)
joblib.dump(models, path.join(model_dir, models_filename), compress=7)
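
# Example (sketch): reloading the saved surrogates in a downstream script;
# assumes the same model_dir/models_filename and matching feature columns:
# loaded = joblib.load(path.join(model_dir, models_filename))
# mae_pred = loaded["mae"].predict(X_array_mae)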

# NOTE: Can use this if looking at how well it memorizes the training data
# # Generate predictions for each model
# predictions = {
#     name: model.predict(X)
#     for name, model, X in zip(
#         sobol_reg_target_names, models.values(), sobol_reg_x_arrays
#     )
# }

# Create a 2x2 grid of subplots
fig, axs = plt.subplots(2, 2, figsize=(8, 8))

# Flatten the axs array for easy iteration
axs = axs.flatten()

for ax, name in zip(axs, sobol_reg_target_names):
    # Get the true and predicted values for this model
    true_values = sobol_reg[name]
    predicted_values = cv_predictions[name]

    # Create the hexbin plot with log scaling
    hb = ax.hexbin(
        true_values, predicted_values, gridsize=50, cmap="viridis", bins="log"
    )
    cb = plt.colorbar(hb, ax=ax)
    cb.set_label("counts (log scale)")

    ax.plot(
        [true_values.min(), true_values.max()],
        [true_values.min(), true_values.max()],
        "w--",
    )
    ax.set_xlabel("True Values")
    ax.set_ylabel("Predicted Values")
    ax.set_title(f"Parity Plot for {name}")

    # Set the aspect ratio to be equal
    ax.set_aspect("equal")

# Adjust the layout and show the plot
plt.tight_layout()

# Save the plot with the updated filename
plot_filename = f"parity_plot_{model_type}_{opt_status}.png"
plt.savefig(path.join(model_dir, plot_filename), dpi=300)

plt.show()



# %% Code Graveyard

# # Compute cross-validated score
# cv_score = cross_val_score(
#     model, X1, y1, cv=5, scoring="neg_mean_squared_error"
# )
# cv_scores[name1] = np.sqrt(np.abs(cv_score.mean()))