encrypted_credit_scoring / development.py
romanbredehoft-zama's picture
Better input/target data split in development
cb3c1a3
raw history blame
No virus
3.69 kB
"""Train and compile the model."""
import shutil
import numpy
import pandas
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from settings import (
DEPLOYMENT_PATH,
RANDOM_STATE,
DATA_PATH,
INPUT_SLICES,
PRE_PROCESSOR_USER_PATH,
PRE_PROCESSOR_THIRD_PARTY_PATH,
USER_COLUMNS,
BANK_COLUMNS,
THIRD_PARTY_COLUMNS,
)
from utils.client_server_interface import MultiInputsFHEModelDev
from utils.model import MultiInputXGBClassifier
from utils.pre_processing import get_pre_processors
def get_processed_multi_inputs(data):
    """Split the pre-processed data into one column block per party.

    Args:
        data: The pre-processed input data. It is indexed as ``data[:, columns]``, where the
            columns of each party are looked up in INPUT_SLICES.

    Returns:
        tuple: The user, bank and third-party column blocks, in that order.
    """
    parties = ("user", "bank", "third_party")
    return tuple(data[:, INPUT_SLICES[party]] for party in parties)
print("Load and pre-process the data")

# The original data set can be found here:
# https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction/data
# It has then been cleaned using the following notebook:
# https://www.kaggle.com/code/samuelcortinhas/credit-cards-data-cleaning
# A few additional pre-processing steps have been applied to this data set as well:
# - the "ID" column has been removed
# - "Total_income" values have been multiplied by 0.14 to make their median match France's
#   median annual salary from 2023 (22050 euros)
data = pandas.read_csv(DATA_PATH, encoding="utf-8")

# Define input and target data ("Target" is removed from the inputs by the pop)
data_x = data.copy()
data_y = data_x.pop("Target").copy()

# Get data from all parties
data_user = data_x[USER_COLUMNS].copy()
data_bank = data_x[BANK_COLUMNS].copy()
data_third_party = data_x[THIRD_PARTY_COLUMNS].copy()

# Feature engineer the data: user and third-party data go through their own fitted pre-processor,
# while the bank data is used as is
pre_processor_user, pre_processor_third_party = get_pre_processors()

preprocessed_data_user = pre_processor_user.fit_transform(data_user)
preprocessed_data_bank = data_bank.to_numpy()
preprocessed_data_third_party = pre_processor_third_party.fit_transform(data_third_party)

# Put the parties side by side, in user / bank / third-party order
# NOTE(review): presumably this column order is the one INPUT_SLICES expects - confirm in settings
preprocessed_data_x = numpy.concatenate(
    (preprocessed_data_user, preprocessed_data_bank, preprocessed_data_third_party),
    axis=1,
)

# The initial data-set is very imbalanced: use SMOTE to get better results. The sampler is seeded
# with the same RANDOM_STATE as the split below, so the over-sampled data set is identical from
# one run to the next (an unseeded SMOTE generates different synthetic samples on every run)
# NOTE(review): over-sampling before the train/test split means synthetic samples may be
# interpolated from rows that end up in the test set - consider resampling the training split only
x, y = SMOTE(random_state=RANDOM_STATE).fit_resample(preprocessed_data_x, data_y)

# Retrieve the training and testing data (70% / 30%, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.3, random_state=RANDOM_STATE
)
print("\nTrain and compile the model")

# fit_benchmark's result is unpacked into the model that gets compiled and deployed, and the
# model used as the "Sklearn" baseline during evaluation
xgb_classifier = MultiInputXGBClassifier(max_depth=3, n_estimators=40)
model, sklearn_model = xgb_classifier.fit_benchmark(X_train, y_train)

# Compile on the training data, split per party, with every one of the three inputs encrypted
multi_inputs_train = get_processed_multi_inputs(X_train)
encryption_status = ["encrypted"] * 3
model.compile(*multi_inputs_train, inputs_encryption_status=encryption_status)

# Remove any deployment folder (and its content) left over from a previous run
if DEPLOYMENT_PATH.is_dir():
    shutil.rmtree(DEPLOYMENT_PATH)
print("\nEvaluate the models")

# Baseline: accuracy of the scikit-learn-side model on the test set, as a percentage
y_pred_sklearn = sklearn_model.predict(X_test)
sklearn_accuracy = accuracy_score(y_test, y_pred_sklearn) * 100
print(f"Sklearn accuracy score : {sklearn_accuracy:.2f}%")

# Same test set, split per party and run through the compiled model with simulate=True
multi_inputs_test = get_processed_multi_inputs(X_test)
y_pred_simulated = model.predict_multi_inputs(*multi_inputs_test, simulate=True)
simulated_accuracy = accuracy_score(y_test, y_pred_simulated) * 100
print(f"Concrete ML accuracy score (simulated) : {simulated_accuracy:.2f}%")
print("\nSave deployment files")

# Save files needed for deployment (and enable cross-platform deployment)
fhe_dev = MultiInputsFHEModelDev(DEPLOYMENT_PATH, model)
fhe_dev.save(via_mlir=True)

# Save the fitted pre-processors, each pickled to its own file
pre_processors_to_save = (
    (PRE_PROCESSOR_USER_PATH, pre_processor_user),
    (PRE_PROCESSOR_THIRD_PARTY_PATH, pre_processor_third_party),
)
for destination, pre_processor in pre_processors_to_save:
    with destination.open("wb") as file:
        pickle.dump(pre_processor, file)

print("\nDone !")