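# Page metadata captured together with this file (not part of the script):
#   author avatar: romanbredehoft-zama's picture
#   commit message: Adding deployment files and updating app
#   commit: c119738
#   page links: raw, history, blame
#   file size: 1.85 kB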
"A script to generate all development files necessary for the project."
import shutil
import numpy
import pandas
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from ..settings import DEPLOYMENT_PATH, RANDOM_STATE
from client_server_interface import MultiInputsFHEModelDev
from model import MultiInputXGBClassifier
from development.pre_processing import pre_process_data
print("Load and pre-process the data")

# Read the cleaned data set (the path is relative to the current working directory)
data = pandas.read_csv("data/clean_data.csv", encoding="utf-8")

# Make median annual salary similar to France (2023): from 157500 to 22050
data["Total_income"] = data["Total_income"] * 0.14

# Remove ID feature
data = data.drop(columns="ID")

# Feature engineer the data. The second returned value is kept under its original name but is
# not used anywhere else in this script.
pre_processed_data, training_bins = pre_process_data(data)

# Define input and target data: "Target" is popped out of the frame, the remaining columns are
# the input features
y = pre_processed_data.pop("Target")
x = pre_processed_data

# The initial data-set is very imbalanced: use SMOTE to get better results.
# SMOTE generates its synthetic samples randomly, so it is seeded with the same RANDOM_STATE as
# the split below. Without a seed, every run would train on different data and produce
# different deployment files.
x, y = SMOTE(random_state=RANDOM_STATE).fit_resample(x, y)

# Retrieve the training data: a stratified 70/30 split whose 30% test part is discarded
X_train, _, y_train, _ = train_test_split(
    x, y, stratify=y, test_size=0.3, random_state=RANDOM_STATE
)

# Convert the Pandas data frames into Numpy arrays
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()
print("Train and compile the model")

# Build the multi-input XGB classifier and fit it on the whole training matrix
model = MultiInputXGBClassifier(n_estimators=40, max_depth=3)
model.fit(X_train_np, y_train_np)

# The feature columns are split column-wise into this many groups; each group is passed to
# `compile` as a separate positional input, and every one of them is marked as encrypted
n_inputs = 3
multi_inputs_train = numpy.array_split(X_train_np, n_inputs, axis=1)
encryption_status = ["encrypted"] * n_inputs

model.compile(*multi_inputs_train, inputs_encryption_status=encryption_status)
# Start from a clean slate: remove any deployment folder (and everything in it) left over from
# a previous run
if DEPLOYMENT_PATH.is_dir():
    shutil.rmtree(DEPLOYMENT_PATH)

print("Save deployment files")

# Wrap the compiled model in the development interface and write the files needed for
# deployment to DEPLOYMENT_PATH
deployment_writer = MultiInputsFHEModelDev(model, DEPLOYMENT_PATH)
deployment_writer.save()

print("Done !")