Spaces:

AccelerationConsortium
/

crabnet-hyperparameter

Running

App Files Files Community

crabnet-hyperparameter / train_surrogate.py

sgbaird

Refactor evaluate function in app.py to include parameter scaling and unscaled evaluation

58815da 9 months ago

raw

history blame

10.3 kB

	import time
	import joblib
	from os import path
	from pathlib import Path
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt

	# from joblib import Parallel, delayed

	from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
	from sklearn.metrics import mean_squared_error
	from sklearn.model_selection import RandomizedSearchCV
	from sklearn.model_selection import KFold

	from scipy.stats import uniform, randint

	# --- Run configuration ---------------------------------------------------
	# Surrogate family to train: "hgbr" (HistGradientBoostingRegressor) or
	# "rfr" (RandomForestRegressor).
	model_type = "hgbr"
	# When True, hyperparameters are tuned with a randomized search before the
	# final fit.
	optimize_hyperparameters = True
	# When True, only the first 100 rows are used and outputs go to "dummy"
	# sub-directories.
	dummy = False
	n_jobs = -1  # Number of jobs to run in parallel. -1 means using all processors.

	data_dir = "."
	model_dir = "models"

	# Validate with an explicit exception rather than `assert`: assertions are
	# stripped when Python runs with -O, which would silently disable the check.
	if model_type not in ("hgbr", "rfr"):
	    raise ValueError(f"Invalid model type: {model_type}, must be 'hgbr' or 'rfr'")

	if dummy:
	    model_dir = path.join(model_dir, "dummy")

	Path(model_dir).mkdir(exist_ok=True, parents=True)

	# The CSV is always read from the original data_dir; data_dir is only
	# redirected to its "dummy" sub-directory afterwards.
	sobol_reg = pd.read_csv(path.join(data_dir, "sobol_regression.csv"))

	if dummy:
	    data_dir = path.join(data_dir, "dummy")
	    sobol_reg = sobol_reg.head(100)

	Path(data_dir).mkdir(exist_ok=True, parents=True)

	# One-hot encode the categorical columns. Only `elemprop_ohe` is merged into
	# the table below; `hardware_ohe` is built but not referenced again in this
	# file.
	elemprop_ohe = pd.get_dummies(sobol_reg["elem_prop"], prefix="elem_prop")
	hardware_ohe = pd.get_dummies(sobol_reg["hardware"], prefix="hardware")

	# Boolean flag: True on rows whose `criterion` is "RobustL1".
	sobol_reg["use_RobustL1"] = sobol_reg["criterion"].eq("RobustL1")

	# Store `bias` as an integer column.
	sobol_reg["bias"] = sobol_reg["bias"].astype(int)

	# Append the elem_prop indicator columns to the right of the table.
	sobol_reg = pd.concat((sobol_reg, elemprop_ohe), axis="columns")

	# Input features shared by every surrogate. The order is the column order
	# the models are trained on, so keep it stable.
	common_features = [
	    "N", "alpha", "d_model", "dim_feedforward", "dropout", "emb_scaler",
	    "eps", "epochs_step", "fudge", "heads", "k", "lr",
	    "pe_resolution", "ple_resolution", "pos_scaler", "weight_decay",
	    "batch_size", "out_hidden4", "betas1", "betas2", "train_frac",
	    "bias", "use_RobustL1",
	    "elem_prop_magpie", "elem_prop_mat2vec", "elem_prop_onehot",
	]


	# MAE surrogate: shared features plus the MAE rank column.
	mae_features = [*common_features, "mae_rank"]
	mae_model_stem = path.join(model_dir, "sobol_reg_mae")
	X_array_mae, y_array_mae = sobol_reg[mae_features], sobol_reg[["mae"]]

	# RMSE surrogate: shared features plus the RMSE rank column.
	rmse_features = [*common_features, "rmse_rank"]
	rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")
	X_array_rmse, y_array_rmse = sobol_reg[rmse_features], sobol_reg[["rmse"]]

	# Model-size surrogate: no model_size_rank because model_size is
	# deterministic via `crabnet.utils.utils.count_parameters`.
	model_size_features = common_features
	model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")
	X_array_model_size = sobol_reg[model_size_features]
	y_array_model_size = sobol_reg[["model_size"]]

	# Runtime surrogate: shared features plus the runtime rank column.
	runtime_features = [*common_features, "runtime_rank"]
	runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")
	X_array_runtime = sobol_reg[runtime_features]
	y_array_runtime = sobol_reg[["runtime"]]


	def _build_estimator(kind):
	    """Return an unfitted regressor and its random-search space for ``kind``.

	    Parameters
	    ----------
	    kind : str
	        "hgbr" for HistGradientBoostingRegressor or "rfr" for
	        RandomForestRegressor.

	    Returns
	    -------
	    tuple
	        ``(model, param_dist)`` where ``param_dist`` is the dictionary handed
	        to RandomizedSearchCV when hyperparameter optimization is enabled.

	    Raises
	    ------
	    ValueError
	        If ``kind`` is not one of the supported model types.
	    """
	    if kind == "hgbr":
	        model = HistGradientBoostingRegressor(random_state=13)
	        param_dist = {
	            "max_iter": randint(100, 200),
	            "max_leaf_nodes": [None, 30, 50],
	            # scipy's uniform(loc, scale) samples from [loc, loc + scale].
	            "learning_rate": uniform(0.01, 0.1),
	            # Add more hyperparameters here as needed
	        }
	    elif kind == "rfr":
	        model = RandomForestRegressor(random_state=13)
	        param_dist = {
	            "n_estimators": randint(100, 200),
	            # 1.0 (all features) is the documented replacement for the
	            # "auto" option, which scikit-learn 1.3 removed for regressors.
	            "max_features": [1.0, "sqrt"],
	            "max_depth": randint(10, 50),
	            "min_samples_split": randint(2, 10),
	            # Add more hyperparameters here as needed
	        }
	    else:
	        raise ValueError(f"Invalid model type: {kind}, must be 'hgbr' or 'rfr'")
	    return model, param_dist


	def _cross_validate(X, y, model, n_splits=5):
	    """Return out-of-fold predictions and the cross-validated RMSE.

	    Each fold is fitted on a clone of ``model`` so the estimator passed in
	    (already fitted on all the data by the caller) is left untouched.

	    Parameters
	    ----------
	    X, y
	        Features and 1-D target; both are indexed positionally via ``.iloc``.
	    model
	        Estimator whose hyperparameters are reused for every fold.
	    n_splits : int, default 5
	        Number of KFold splits.

	    Returns
	    -------
	    tuple
	        ``(predictions, rmse)``: the list of held-out predictions and the
	        square root of the mean of the per-fold mean squared errors.
	    """
	    # Local import keeps this block self-contained; scikit-learn is already a
	    # dependency of this file.
	    from sklearn.base import clone

	    fold_predictions = []
	    fold_mse = []
	    # KFold does not shuffle by default, so the test folds are consecutive row
	    # ranges and the concatenated predictions follow the row order of X.
	    for train_index, test_index in KFold(n_splits=n_splits).split(X):
	        fold_model = clone(model)
	        fold_model.fit(X.iloc[train_index], y.iloc[train_index])
	        preds = fold_model.predict(X.iloc[test_index])
	        fold_predictions.extend(preds)
	        fold_mse.append(mean_squared_error(y.iloc[test_index], preds))
	    return fold_predictions, np.sqrt(np.mean(fold_mse))


	def train_and_save(
	    sr_feat_array,
	    sr_labels_array,
	    sr_label_names,
	    optimize_hyperparameters=False,
	):
	    """Fit and cross-validate one surrogate regressor per target.

	    Despite the name, nothing is written to disk here; the caller is
	    responsible for persisting the returned objects. The estimator family is
	    taken from the module-level ``model_type`` and the search parallelism from
	    the module-level ``n_jobs``.

	    Parameters
	    ----------
	    sr_feat_array : iterable
	        Feature tables, one per target (must support ``.shape`` and ``.iloc``).
	    sr_labels_array : iterable
	        Label tables, one per target; each is squeezed to one dimension.
	    sr_label_names : iterable of str
	        Target names used as keys of the returned dictionaries.
	    optimize_hyperparameters : bool, default False
	        If True, tune each model with RandomizedSearchCV (10 candidates,
	        5-fold CV, negative MSE) and keep its best estimator.

	    Returns
	    -------
	    tuple of dict
	        ``(models, timings, avg_cv_scores, cv_predictions)``, each keyed by
	        target name. ``models`` holds estimators fitted on all rows,
	        ``timings`` the fit/search wall time in seconds (cross-validation
	        excluded), ``avg_cv_scores`` the 5-fold RMSE and ``cv_predictions``
	        the out-of-fold predictions.

	    Raises
	    ------
	    ValueError
	        If the module-level ``model_type`` is not "hgbr" or "rfr".
	    """
	    models = {}
	    timings = {}
	    avg_cv_scores = {}
	    cv_predictions = {}

	    for X1, y1, name1 in zip(sr_feat_array, sr_labels_array, sr_label_names):
	        # Collapse the single-column label table to one dimension.
	        y1 = y1.squeeze()
	        print(f"X1 sr shape: {X1.shape}, Y1 sr shape: {y1.shape}")

	        model, param_dist = _build_estimator(model_type)

	        if optimize_hyperparameters:
	            # Use RandomizedSearchCV to tune the hyperparameters
	            random_search = RandomizedSearchCV(
	                model,
	                param_dist,
	                n_iter=10,
	                cv=5,
	                scoring="neg_mean_squared_error",
	                random_state=13,
	                n_jobs=n_jobs,
	            )
	            start_time = time.perf_counter()
	            random_search.fit(X1, y1)
	            timings[name1] = time.perf_counter() - start_time
	            # The best estimator has been refit on all of X1 by the search.
	            model = random_search.best_estimator_
	        else:
	            start_time = time.perf_counter()
	            model.fit(X1, y1)
	            timings[name1] = time.perf_counter() - start_time

	        print(f"Trained {name1} in {timings[name1]} seconds")

	        # Cross-validate manually to keep track of the predictions.
	        # NOTE: This doesn't use GroupKFold, which would prevent cross-leakage
	        # for the rank column.
	        cv_predictions[name1], avg_cv_scores[name1] = _cross_validate(X1, y1, model)

	        print(f"Cross-validated score for {name1}: {avg_cv_scores[name1]}")

	        models[name1] = model

	        print()

	    return models, timings, avg_cv_scores, cv_predictions


	# Per-target inputs in matching order: names, feature tables, label tables.
	sobol_reg_target_names = ["mae", "rmse", "model_size", "runtime"]
	sobol_reg_x_arrays = [
	    X_array_mae,
	    X_array_rmse,
	    X_array_model_size,
	    X_array_runtime,
	]
	sobol_reg_labels = [
	    y_array_mae,
	    y_array_rmse,
	    y_array_model_size,
	    y_array_runtime,
	]

	# Train one surrogate per target on all the data. With optimization enabled
	# this is slow (an earlier note estimated ~16 min for iter=5 & cv=3 -- TODO
	# confirm for the current search settings).
	models, timings, avg_cv_scores, cv_predictions = train_and_save(
	    sobol_reg_x_arrays,
	    sobol_reg_labels,
	    sobol_reg_target_names,
	    optimize_hyperparameters=optimize_hyperparameters,
	)

	# The timings cover fitting/searching only, not the cross-validation pass.
	print(f"Timings (in seconds): {timings}")
	print(f"Cross-validated scores: {avg_cv_scores}")

	# Tabulate the timing and CV score of each target model.
	results = pd.DataFrame(
	    {
	        "Model": list(timings),
	        "Timing": list(timings.values()),
	        "CV Score": list(avg_cv_scores.values()),
	    }
	)

	# Derive the file-name tags from the first trained model and the
	# optimization flag.
	first_model = next(iter(models.values()))
	if isinstance(first_model, HistGradientBoostingRegressor):
	    model_type = "hgbr"
	else:
	    model_type = "rfr"
	opt_status = "opt" if optimize_hyperparameters else "no_opt"

	# Write the results table and the compressed model dictionary to model_dir.
	results_filename = f"model_results_{model_type}_{opt_status}.csv"
	models_filename = f"surrogate_models_{model_type}_{opt_status}.pkl"

	results.to_csv(path.join(model_dir, results_filename), index=False)
	joblib.dump(models, path.join(model_dir, models_filename), compress=7)

	# NOTE: To look at how well the models memorize the training data, predict
	# on the training features instead of using the CV predictions:
	# predictions = {
	#     name: model.predict(X)
	#     for name, model, X in zip(
	#         sobol_reg_target_names, models.values(), sobol_reg_x_arrays
	#     )
	# }

	# One parity panel per target, laid out on a 2x2 grid and iterated as a
	# flat sequence of axes.
	fig, axs = plt.subplots(2, 2, figsize=(8, 8))
	axs = axs.ravel()

	for ax, name in zip(axs, sobol_reg_target_names):
	    # Observed values against cross-validated predictions for this target.
	    # Assumes cv_predictions[name] follows the row order of sobol_reg --
	    # verify against train_and_save.
	    observed = sobol_reg[name]
	    predicted = cv_predictions[name]

	    # Hexagonal 2-D histogram with logarithmic count bins.
	    hexes = ax.hexbin(
	        observed,
	        predicted,
	        gridsize=50,
	        cmap="viridis",
	        bins="log",
	    )
	    colorbar = plt.colorbar(hexes, ax=ax)
	    colorbar.set_label("counts (log scale)")

	    # White dashed y = x reference line spanning the observed range.
	    diagonal = [observed.min(), observed.max()]
	    ax.plot(diagonal, diagonal, "w--")

	    ax.set_xlabel("True Values")
	    ax.set_ylabel("Predicted Values")
	    ax.set_title(f"Parity Plot for {name}")

	    # Same scale on both axes.
	    ax.set_aspect("equal")

	plt.tight_layout()

	# Save the figure next to the models, tagged like the other outputs, then
	# display it.
	plot_filename = f"parity_plot_{model_type}_{opt_status}.png"
	plt.savefig(path.join(model_dir, plot_filename), dpi=300)

	plt.show()

	# No-op statement; presumably a convenient line for a debugger breakpoint.
	1 + 1


	# %% Code Graveyard

	# # Compute cross-validated score
	# cv_score = cross_val_score(
	# model, X1, y1, cv=5, scoring="neg_mean_squared_error"
	# )
	# cv_scores[name1] = np.sqrt(np.abs(cv_score.mean()))