Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
"""A script to generate all development files necessary for the project."""
import shutil | |
import numpy | |
import pandas | |
from sklearn.model_selection import train_test_split | |
from imblearn.over_sampling import SMOTE | |
from ..settings import DEPLOYMENT_PATH, RANDOM_STATE | |
from client_server_interface import MultiInputsFHEModelDev | |
from model import MultiInputXGBClassifier | |
from development.pre_processing import pre_process_data | |
# Script body: load and pre-process the data, train and compile the
# multi-input model, then write the deployment files to DEPLOYMENT_PATH.

print("Load and pre-process the data")

data = pandas.read_csv("data/clean_data.csv", encoding="utf-8")

# Make median annual salary similar to France (2023): from 157500 to 22050
data["Total_income"] = data["Total_income"] * 0.14

# Remove the ID feature
data = data.drop(columns="ID")

# Feature engineer the data
pre_processed_data, training_bins = pre_process_data(data)

# Define input and target data ("Target" is removed from the features by pop)
y = pre_processed_data.pop("Target")
x = pre_processed_data

# The initial data-set is very imbalanced: use SMOTE to get better results.
# SMOTE is seeded with RANDOM_STATE, like the split below, so that the
# synthetic samples are the same from one run to the next.
x, y = SMOTE(random_state=RANDOM_STATE).fit_resample(x, y)

# Retrieve the training data (the test part of the split is not used here)
X_train, _, y_train, _ = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

# Convert the Pandas data frames into Numpy arrays
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()

print("Train and compile the model")

model = MultiInputXGBClassifier(max_depth=3, n_estimators=40)
model.fit(X_train_np, y_train_np)

# Split the feature columns into three groups; each group is passed to
# compile() as a separate input, and every input is declared as encrypted.
multi_inputs_train = numpy.array_split(X_train_np, 3, axis=1)
model.compile(
    *multi_inputs_train,
    inputs_encryption_status=["encrypted"] * len(multi_inputs_train),
)

# Delete the deployment folder and its content if it already exists
if DEPLOYMENT_PATH.is_dir():
    shutil.rmtree(DEPLOYMENT_PATH)

print("Save deployment files")

# Save the files needed for deployment
fhe_dev = MultiInputsFHEModelDev(model, DEPLOYMENT_PATH)
fhe_dev.save()

print("Done !")