Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
"""Train and compile the model.""" | |
import shutil | |
import numpy | |
import pandas | |
import pickle | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import accuracy_score | |
from imblearn.over_sampling import SMOTE | |
from settings import ( | |
DEPLOYMENT_PATH, | |
RANDOM_STATE, | |
DATA_PATH, | |
INPUT_SLICES, | |
PRE_PROCESSOR_USER_PATH, | |
PRE_PROCESSOR_THIRD_PARTY_PATH, | |
USER_COLUMNS, | |
BANK_COLUMNS, | |
THIRD_PARTY_COLUMNS, | |
) | |
from utils.client_server_interface import MultiInputsFHEModelDev | |
from utils.model import MultiInputXGBClassifier | |
from utils.pre_processing import get_pre_processors | |
def get_processed_multi_inputs(data):
    """Split a pre-processed feature matrix into one column block per party.

    Args:
        data: Array indexable along two axes; its columns are selected using the
            entries of INPUT_SLICES.

    Returns:
        A 3-tuple ordered as (user, bank, third_party), each item being the
        columns of ``data`` selected by the matching INPUT_SLICES entry.
    """
    # The order matters: callers unpack this tuple positionally.
    parties = ("user", "bank", "third_party")
    return tuple(data[:, INPUT_SLICES[party]] for party in parties)
print("Load and pre-process the data")

# The original data set can be found here:
# https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction/data
# It was then cleaned using the following notebook:
# https://www.kaggle.com/code/samuelcortinhas/credit-cards-data-cleaning
# A few additional pre-processing steps have been applied to this data set as well:
# - the "ID" column has been removed
# - "Total_income" values have been multiplied by 0.14 so that its median matches France's
#   2023 annual salary median (22050 euros)
data = pandas.read_csv(DATA_PATH, encoding="utf-8")

# Define input and target data ("Target" is removed from `data` by the pop)
data_y = data.pop("Target").copy()
data_x = data.copy()

# Get data from all parties
data_user = data_x[USER_COLUMNS].copy()
data_bank = data_x[BANK_COLUMNS].copy()
data_third_party = data_x[THIRD_PARTY_COLUMNS].copy()

# Feature engineer the data. Only the user and third-party data go through a fitted
# pre-processor; the bank data is used as-is.
pre_processor_user, pre_processor_third_party = get_pre_processors()

preprocessed_data_user = pre_processor_user.fit_transform(data_user)
preprocessed_data_bank = data_bank.to_numpy()
preprocessed_data_third_party = pre_processor_third_party.fit_transform(data_third_party)

# Column order here is user, bank, third party.
# NOTE(review): presumably INPUT_SLICES in settings assumes this exact order - confirm.
preprocessed_data_x = numpy.concatenate(
    (preprocessed_data_user, preprocessed_data_bank, preprocessed_data_third_party),
    axis=1,
)

# The initial data-set is very imbalanced: use SMOTE to get better results.
# SMOTE is seeded with the same RANDOM_STATE as the split below so that the resampled data,
# the trained model and the reported accuracies are reproducible from one run to the next.
# NOTE(review): resampling happens before the train/test split, so synthetic samples derived
# from test rows can end up in the training set; the accuracies printed below may therefore
# be optimistic. Left unchanged to keep the deployed model's training recipe as-is.
x, y = SMOTE(random_state=RANDOM_STATE).fit_resample(preprocessed_data_x, data_y)

# Retrieve the training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.3, random_state=RANDOM_STATE
)

print("\nTrain and compile the model")

model = MultiInputXGBClassifier(max_depth=3, n_estimators=40)

# fit_benchmark returns two models; the second one is used below as the scikit-learn
# reference when comparing accuracies.
model, sklearn_model = model.fit_benchmark(X_train, y_train)

# Compile on the training data split per party, with all three inputs marked as encrypted
multi_inputs_train = get_processed_multi_inputs(X_train)

model.compile(
    *multi_inputs_train,
    inputs_encryption_status=["encrypted", "encrypted", "encrypted"],
)

# Delete the deployment folder and its content if it already exists
# (presumably so that the save step below starts from a clean location - confirm)
if DEPLOYMENT_PATH.is_dir():
    shutil.rmtree(DEPLOYMENT_PATH)

print("\nEvaluate the models")

y_pred_sklearn = sklearn_model.predict(X_test)

print(f"Sklearn accuracy score : {accuracy_score(y_test, y_pred_sklearn )*100:.2f}%")

multi_inputs_test = get_processed_multi_inputs(X_test)

y_pred_simulated = model.predict_multi_inputs(*multi_inputs_test, simulate=True)

print(f"Concrete ML accuracy score (simulated) : {accuracy_score(y_test, y_pred_simulated)*100:.2f}%")

print("\nSave deployment files")

# Save files needed for deployment (and enable cross-platform deployment)
fhe_dev = MultiInputsFHEModelDev(DEPLOYMENT_PATH, model)
fhe_dev.save(via_mlir=True)

# Save the fitted pre-processors
with PRE_PROCESSOR_USER_PATH.open('wb') as file:
    pickle.dump(pre_processor_user, file)

with PRE_PROCESSOR_THIRD_PARTY_PATH.open('wb') as file:
    pickle.dump(pre_processor_third_party, file)

print("\nDone !")