import time
import joblib
from os import path
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from joblib import Parallel, delayed

from sklearn.base import clone
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, RandomizedSearchCV

from scipy.stats import uniform, randint

model_type = "hgbr"  # "hgbr" or "rfr"
optimize_hyperparameters = True
dummy = False
n_jobs = -1  # Number of jobs to run in parallel. -1 means using all processors.

data_dir = "."
model_dir = "models"

assert model_type in [
    "hgbr",
    "rfr",
], f"Invalid model type: {model_type}, must be 'hgbr' or 'rfr'"

if dummy:
    model_dir = path.join(model_dir, "dummy")

Path(model_dir).mkdir(exist_ok=True, parents=True)

sobol_reg = pd.read_csv(path.join(data_dir, "sobol_regression.csv"))

if dummy:
    data_dir = path.join(data_dir, "dummy")
    sobol_reg = sobol_reg.head(100)

Path(data_dir).mkdir(exist_ok=True, parents=True)

elemprop_ohe = pd.get_dummies(sobol_reg["elem_prop"], prefix="elem_prop")
hardware_ohe = pd.get_dummies(sobol_reg["hardware"], prefix="hardware")
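# NOTE: hardware_ohe is built but never concatenated into sobol_reg, so the
# hardware columns are not part of the feature set below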

sobol_reg["use_RobustL1"] = sobol_reg["criterion"] == "RobustL1"

sobol_reg["bias"] = sobol_reg["bias"].astype(int)

sobol_reg = pd.concat([sobol_reg, elemprop_ohe], axis=1)

common_features = [
    "N",
    "alpha",
    "d_model",
    "dim_feedforward",
    "dropout",
    "emb_scaler",
    "eps",
    "epochs_step",
    "fudge",
    "heads",
    "k",
    "lr",
    "pe_resolution",
    "ple_resolution",
    "pos_scaler",
    "weight_decay",
    "batch_size",
    "out_hidden4",
    "betas1",
    "betas2",
    "train_frac",
    "bias",
    "use_RobustL1",
    "elem_prop_magpie",
    "elem_prop_mat2vec",
    "elem_prop_onehot",
]
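
# Each target below (except model_size) adds its own rank column to these
# shared features; see the cross-leakage NOTE inside train_and_save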


mae_features = common_features + ["mae_rank"]
X_array_mae = sobol_reg[mae_features]
y_array_mae = sobol_reg[["mae"]]
mae_model_stem = path.join(model_dir, "sobol_reg_mae")

rmse_features = common_features + ["rmse_rank"]
X_array_rmse = sobol_reg[rmse_features]
y_array_rmse = sobol_reg[["rmse"]]
rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")

# no model_size_rank because model_size is deterministic via
# `crabnet.utils.utils.count_parameters`
model_size_features = common_features
X_array_model_size = sobol_reg[model_size_features]
y_array_model_size = sobol_reg[["model_size"]]
model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")

runtime_features = common_features + ["runtime_rank"]
X_array_runtime = sobol_reg[runtime_features]
y_array_runtime = sobol_reg[["runtime"]]
runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")


def train_and_save(
    sr_feat_array,
    sr_labels_array,
    sr_label_names,
    optimize_hyperparameters=False,
):
    models = {}
    timings = {}
    # cv_scores = []
    avg_cv_scores = {}
    cv_predictions = {}

    for X1, y1, name1 in zip(sr_feat_array, sr_labels_array, sr_label_names):
        y1 = y1.squeeze()
        print(f"X1 sr shape: {X1.shape}, Y1 sr shape: {y1.shape}")

        if model_type == "rfr":
            model = RandomForestRegressor(random_state=13)
        elif model_type == "hgbr":
            model = HistGradientBoostingRegressor(random_state=13)

        if optimize_hyperparameters:
            # define hyperparameters to tune
            if model.__class__.__name__ == "HistGradientBoostingRegressor":
                param_dist = {
                    "max_iter": randint(100, 200),
                    "max_leaf_nodes": [None, 30, 50],
                    "learning_rate": uniform(0.01, 0.1),
                    # Add more hyperparameters here as needed
                }
            elif model.__class__.__name__ == "RandomForestRegressor":
                param_dist = {
                    "n_estimators": randint(100, 200),
                    "max_features": ["auto", "sqrt"],
                    "max_depth": randint(10, 50),
                    "min_samples_split": randint(2, 10),
                    # Add more hyperparameters here as needed
                }

            # Use RandomizedSearchCV to tune the hyperparameters
            random_search = RandomizedSearchCV(
                model,
                param_dist,
                n_iter=10,
                cv=5,
                scoring="neg_mean_squared_error",
                random_state=13,
                n_jobs=n_jobs,
            )

            start_time = time.time()
            # y1 was squeezed to a 1D Series above, so it can be passed directly
            random_search.fit(X1, y1)
            end_time = time.time()

            # Use the best estimator found by RandomizedSearchCV
            model = random_search.best_estimator_
            timings[name1] = end_time - start_time
        else:
            start_time = time.time()
            model.fit(X1, y1)
            end_time = time.time()
            timings[name1] = end_time - start_time

        print(f"Trained {name1} in {timings[name1]} seconds")

        # Perform cross-validation manually to keep track of predictions.
        # NOTE: This doesn't use GroupKFold, which would prevent cross-leakage
        # for the rank column (see the sketch after the call below)

        def cross_validate(X1, y1, model):
            cv = KFold(n_splits=5)
            cv_preds = []
            cv_scores = []
            for train_index, test_index in cv.split(X1):
                X_train, X_test = X1.iloc[train_index], X1.iloc[test_index]
                y_train, y_test = y1.iloc[train_index], y1.iloc[test_index]
                # clone() gives each fold a fresh, unfitted copy so the
                # full-data fit stored in `models` isn't overwritten
                fold_model = clone(model)
                fold_model.fit(X_train, y_train)
                preds = fold_model.predict(X_test)
                cv_preds.extend(preds)
                cv_scores.append(mean_squared_error(y_test, preds))
            return cv_preds, np.sqrt(np.mean(cv_scores))

        cv_predictions[name1], avg_cv_scores[name1] = cross_validate(X1, y1, model)
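
        # Sketch (assumption): if rows sharing a rank value should stay in the
        # same fold, GroupKFold would enforce that. `group_labels` is
        # hypothetical and would need to be derived from the data:
        # from sklearn.model_selection import GroupKFold
        # cv = GroupKFold(n_splits=5)
        # for train_index, test_index in cv.split(X1, y1, groups=group_labels):
        #     ...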

        # # Parallelize the outer loop
        # results = Parallel(n_jobs=n_jobs)(
        #     delayed(cross_validate)(X1, y1, model)
        #     for X1, y1 in zip(sr_feat_array, sr_labels_array)
        # )

        # # Unpack the results
        # cv_predictions, avg_cv_scores = zip(*results)

        # # Convert the results to dictionaries
        # cv_predictions = dict(zip(sobol_reg_target_names, cv_predictions))
        # avg_cv_scores = dict(zip(sobol_reg_target_names, avg_cv_scores))

        print(f"Cross-validated score for {name1}: {avg_cv_scores[name1]}")

        models[name1] = model

        print()

    return models, timings, avg_cv_scores, cv_predictions


# List of x_arrays, y_arrays, and target_names
sobol_reg_x_arrays = [X_array_mae, X_array_rmse, X_array_model_size, X_array_runtime]
sobol_reg_labels = [y_array_mae, y_array_rmse, y_array_model_size, y_array_runtime]
sobol_reg_target_names = ["mae", "rmse", "model_size", "runtime"]

# Train and save the model on all the data
models, timings, avg_cv_scores, cv_predictions = train_and_save(
    sobol_reg_x_arrays,
    sobol_reg_labels,
    sobol_reg_target_names,
    optimize_hyperparameters=optimize_hyperparameters,  # if True, expect much longer runtime (~16 min was seen with n_iter=5 and cv=3; the search above uses n_iter=10, cv=5)
)

print(f"Timings (in seconds): {timings}")  # doesn't include cross_val_score runtime
print(f"Cross-validated scores: {avg_cv_scores}")

# Save timings and cv_scores to a CSV file
results = pd.DataFrame(
    {
        "Model": list(timings.keys()),
        "Timing": list(timings.values()),
        "CV Score": list(avg_cv_scores.values()),
    }
)

# Determine the model type and optimization status
model_type = (
    "hgbr"
    if isinstance(next(iter(models.values())), HistGradientBoostingRegressor)
    else "rfr"
)
opt_status = "opt" if optimize_hyperparameters else "no_opt"

# Save the results and models with the updated filenames
results_filename = f"model_results_{model_type}_{opt_status}.csv"
models_filename = f"surrogate_models_{model_type}_{opt_status}.pkl"

results.to_csv(path.join(model_dir, results_filename), index=False)
joblib.dump(models, path.join(model_dir, models_filename), compress=7)
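
# Example (sketch): reloading the saved surrogates in a downstream script;
# assumes the same model_dir/models_filename and matching feature columns:
# loaded = joblib.load(path.join(model_dir, models_filename))
# mae_pred = loaded["mae"].predict(X_array_mae)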

# NOTE: Can use this if looking at how well it memorizes the training data
# # Generate predictions for each model
# predictions = {
#     name: model.predict(X)
#     for name, model, X in zip(
#         sobol_reg_target_names, models.values(), sobol_reg_x_arrays
#     )
# }

# Create a 2x2 grid of subplots
fig, axs = plt.subplots(2, 2, figsize=(8, 8))

# Flatten the axs array for easy iteration
axs = axs.flatten()

for ax, name in zip(axs, sobol_reg_target_names):
    # Get the true and predicted values for this model
    true_values = sobol_reg[name]
    predicted_values = cv_predictions[name]

    # Create the hexbin plot with log scaling
    hb = ax.hexbin(
        true_values, predicted_values, gridsize=50, cmap="viridis", bins="log"
    )
    cb = plt.colorbar(hb, ax=ax)
    cb.set_label("counts (log scale)")

    ax.plot(
        [true_values.min(), true_values.max()],
        [true_values.min(), true_values.max()],
        "w--",
    )
    ax.set_xlabel("True Values")
    ax.set_ylabel("Predicted Values")
    ax.set_title(f"Parity Plot for {name}")

    # Set the aspect ratio to be equal
    ax.set_aspect("equal")

# Adjust the layout and show the plot
plt.tight_layout()

# Save the plot with the updated filename
plot_filename = f"parity_plot_{model_type}_{opt_status}.png"
plt.savefig(path.join(model_dir, plot_filename), dpi=300)

plt.show()



# %% Code Graveyard

# # Compute cross-validated score
# cv_score = cross_val_score(
#     model, X1, y1, cv=5, scoring="neg_mean_squared_error"
# )
# cv_scores[name1] = np.sqrt(np.abs(cv_score.mean()))