"""Train and compile the model.""" import shutil import numpy import pandas import pickle from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from imblearn.over_sampling import SMOTE from settings import DEPLOYMENT_PATH, RANDOM_STATE, DATA_PATH, INPUT_SLICES, PRE_PROCESSOR_USER_PATH, PRE_PROCESSOR_THIRD_PARTY_PATH from utils.client_server_interface import MultiInputsFHEModelDev from utils.model import MultiInputXGBClassifier from utils.pre_processing import get_pre_processors, select_and_pop_features def get_processed_multi_inputs(data): return ( data[:, INPUT_SLICES["user"]], data[:, INPUT_SLICES["bank"]], data[:, INPUT_SLICES["third_party"]] ) print("Load and pre-process the data") # Original data set can be found here : # https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction/data # It then has been cleaned using the following notebook : # https://www.kaggle.com/code/samuelcortinhas/credit-cards-data-cleaning # A few additional pre-processing steps has bee applied to this data set as well : # - "ID" column has been removed # - "Total_income" values have been multiplied by 0.14 to make its median match France's annual # salary one from 2023 (22050 euros) data = pandas.read_csv(DATA_PATH, encoding="utf-8") # Define input and target data data_y = data.pop("Target").copy() data_x = data.copy() # Get data from all parties data_third_party = select_and_pop_features(data_x, ["Years_employed", "Salaried"]) data_bank = select_and_pop_features(data_x, ["Account_length"]) data_user = data_x.copy() # Feature engineer the data pre_processor_user, pre_processor_third_party = get_pre_processors() preprocessed_data_user = pre_processor_user.fit_transform(data_user) preprocessed_data_bank = data_bank.to_numpy() preprocessed_data_third_party = pre_processor_third_party.fit_transform(data_third_party) preprocessed_data_x = numpy.concatenate((preprocessed_data_user, preprocessed_data_bank, preprocessed_data_third_party), axis=1) # The initial data-set is very imbalanced: use SMOTE to get better results x, y = SMOTE().fit_resample(preprocessed_data_x, data_y) # Retrieve the training and testing data X_train, X_test, y_train, y_test = train_test_split( x, y, stratify=y, test_size=0.3, random_state=RANDOM_STATE ) print("\nTrain and compile the model") model = MultiInputXGBClassifier(max_depth=3, n_estimators=40) model, sklearn_model = model.fit_benchmark(X_train, y_train) multi_inputs_train = get_processed_multi_inputs(X_train) model.compile(*multi_inputs_train, inputs_encryption_status=["encrypted", "encrypted", "encrypted"]) # Delete the deployment folder and its content if it already exists if DEPLOYMENT_PATH.is_dir(): shutil.rmtree(DEPLOYMENT_PATH) print("\nEvaluate the models") y_pred_sklearn = sklearn_model.predict(X_test) print(f"Sklearn accuracy score : {accuracy_score(y_test, y_pred_sklearn )*100:.2f}%") multi_inputs_test = get_processed_multi_inputs(X_test) y_pred_simulated = model.predict_multi_inputs(*multi_inputs_test, simulate=True) print(f"Concrete ML accuracy score (simulated) : {accuracy_score(y_test, y_pred_simulated)*100:.2f}%") print("\nSave deployment files") # Save files needed for deployment (and enable cross-platform deployment) fhe_dev = MultiInputsFHEModelDev(DEPLOYMENT_PATH, model) fhe_dev.save(via_mlir=True) # Save pre-processors with PRE_PROCESSOR_USER_PATH.open('wb') as file: pickle.dump(pre_processor_user, file) with PRE_PROCESSOR_THIRD_PARTY_PATH.open('wb') as file: pickle.dump(pre_processor_third_party, file) 
print("\nDone !")