romanbredehoft-zama
committed on
Commit
•
18ba8c1
1
Parent(s):
e3103dd
Update to synthetic data-set
Browse files
- app.py +2 -2
- backend.py +26 -18
- data/data.csv +0 -0
- data/gpt_data.csv +57 -0
- deployment_files/client.zip +2 -2
- deployment_files/pre_processor_bank.pkl +3 -0
- deployment_files/pre_processor_third_party.pkl +2 -2
- deployment_files/pre_processor_user.pkl +2 -2
- deployment_files/server.zip +2 -2
- development.py +17 -46
- requirements.txt +1 -2
- settings.py +6 -8
- utils/client_server_interface.py +8 -2
- utils/pre_processing.py +21 -41
app.py
CHANGED
@@ -76,7 +76,7 @@ with demo:
|
|
76 |
with gr.Row():
|
77 |
with gr.Column():
|
78 |
gr.Markdown("### User")
|
79 |
-
bool_inputs = gr.CheckboxGroup(["Car", "Property", "
|
80 |
num_children = gr.Slider(**CHILDREN_MIN_MAX, step=1, label="Number of children", info="How many children do you have ?")
|
81 |
household_size = gr.Slider(**FAMILY_MIN_MAX, step=1, label="Household size", info="How many members does your household have? ?")
|
82 |
total_income = gr.Slider(**INCOME_MIN_MAX, label="Income", info="What's you total yearly income (in euros) ?")
|
@@ -132,7 +132,7 @@ with demo:
|
|
132 |
needing to decrypt any value.
|
133 |
|
134 |
This server employs an [XGBoost](https://github.com/dmlc/xgboost) classifier model that has
|
135 |
-
been trained on
|
136 |
"""
|
137 |
)
|
138 |
|
|
|
76 |
with gr.Row():
|
77 |
with gr.Column():
|
78 |
gr.Markdown("### User")
|
79 |
+
bool_inputs = gr.CheckboxGroup(["Car", "Property", "Mobile phone"], label="Which of the following do you actively hold or own?")
|
80 |
num_children = gr.Slider(**CHILDREN_MIN_MAX, step=1, label="Number of children", info="How many children do you have ?")
|
81 |
household_size = gr.Slider(**FAMILY_MIN_MAX, step=1, label="Household size", info="How many members does your household have? ?")
|
82 |
total_income = gr.Slider(**INCOME_MIN_MAX, label="Income", info="What's you total yearly income (in euros) ?")
|
|
|
132 |
needing to decrypt any value.
|
133 |
|
134 |
This server employs an [XGBoost](https://github.com/dmlc/xgboost) classifier model that has
|
135 |
+
been trained on a synthetic data-set.
|
136 |
"""
|
137 |
)
|
138 |
|
backend.py
CHANGED
@@ -15,24 +15,29 @@ from settings import (
|
|
15 |
CLIENT_FILES,
|
16 |
SERVER_FILES,
|
17 |
DEPLOYMENT_PATH,
|
18 |
-
|
19 |
INPUT_INDEXES,
|
20 |
INPUT_SLICES,
|
21 |
PRE_PROCESSOR_USER_PATH,
|
|
|
22 |
PRE_PROCESSOR_THIRD_PARTY_PATH,
|
23 |
CLIENT_TYPES,
|
24 |
USER_COLUMNS,
|
|
|
25 |
THIRD_PARTY_COLUMNS,
|
26 |
)
|
27 |
|
28 |
from utils.client_server_interface import MultiInputsFHEModelClient
|
29 |
|
30 |
# Load pre-processor instances
|
31 |
-
with
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
36 |
|
37 |
|
38 |
def shorten_bytes_object(bytes_object, limit=500):
|
@@ -124,8 +129,8 @@ def _send_to_server(client_id, client_type, file_name):
|
|
124 |
|
125 |
Args:
|
126 |
client_id (int): The client ID to consider.
|
127 |
-
client_type (Optional[str]): The type of client to consider (either 'user', 'bank',
|
128 |
-
None).
|
129 |
file_name (str): File name to send (either 'evaluation_key' or 'encrypted_inputs').
|
130 |
"""
|
131 |
# Get the paths to the encrypted inputs
|
@@ -213,7 +218,7 @@ def _encrypt_send(client_id, inputs, client_type):
|
|
213 |
encrypted_inputs = client.quantize_encrypt_serialize_multi_inputs(
|
214 |
inputs,
|
215 |
input_index=INPUT_INDEXES[client_type],
|
216 |
-
|
217 |
input_slice=INPUT_SLICES[client_type],
|
218 |
)
|
219 |
|
@@ -251,16 +256,12 @@ def pre_process_encrypt_send_user(client_id, *inputs):
|
|
251 |
# Retrieve boolean values
|
252 |
own_car = "Car" in bool_inputs
|
253 |
own_property = "Property" in bool_inputs
|
254 |
-
|
255 |
-
phone = "Phone" in bool_inputs
|
256 |
-
email = "Email" in bool_inputs
|
257 |
|
258 |
user_inputs = pandas.DataFrame({
|
259 |
"Own_car": [own_car],
|
260 |
"Own_property": [own_property],
|
261 |
-
"
|
262 |
-
"Phone": [phone],
|
263 |
-
"Email": [email],
|
264 |
"Num_children": [num_children],
|
265 |
"Household_size": [household_size],
|
266 |
"Total_income": [total_income],
|
@@ -291,8 +292,16 @@ def pre_process_encrypt_send_bank(client_id, *inputs):
|
|
291 |
the encrypted input to send.
|
292 |
"""
|
293 |
account_age = inputs[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
294 |
|
295 |
-
return _encrypt_send(client_id,
|
296 |
|
297 |
|
298 |
def pre_process_encrypt_send_third_party(client_id, *inputs):
|
@@ -421,5 +430,4 @@ def decrypt_output(client_id):
|
|
421 |
# Determine the predicted class
|
422 |
output = numpy.argmax(output_proba, axis=1).squeeze()
|
423 |
|
424 |
-
|
425 |
-
return "Credit card is likely to be approved ✅" if output == 0 else "Credit card is likely to be denied ❌"
|
|
|
15 |
CLIENT_FILES,
|
16 |
SERVER_FILES,
|
17 |
DEPLOYMENT_PATH,
|
18 |
+
PROCESSED_INPUT_SHAPE,
|
19 |
INPUT_INDEXES,
|
20 |
INPUT_SLICES,
|
21 |
PRE_PROCESSOR_USER_PATH,
|
22 |
+
PRE_PROCESSOR_BANK_PATH,
|
23 |
PRE_PROCESSOR_THIRD_PARTY_PATH,
|
24 |
CLIENT_TYPES,
|
25 |
USER_COLUMNS,
|
26 |
+
BANK_COLUMNS,
|
27 |
THIRD_PARTY_COLUMNS,
|
28 |
)
|
29 |
|
30 |
from utils.client_server_interface import MultiInputsFHEModelClient
|
31 |
|
32 |
# Load pre-processor instances
|
33 |
+
with (
|
34 |
+
PRE_PROCESSOR_USER_PATH.open('rb') as file_user,
|
35 |
+
PRE_PROCESSOR_BANK_PATH.open('rb') as file_bank,
|
36 |
+
PRE_PROCESSOR_THIRD_PARTY_PATH.open('rb') as file_third_party,
|
37 |
+
):
|
38 |
+
PRE_PROCESSOR_USER = pickle.load(file_user)
|
39 |
+
PRE_PROCESSOR_BANK = pickle.load(file_bank)
|
40 |
+
PRE_PROCESSOR_THIRD_PARTY = pickle.load(file_third_party)
|
41 |
|
42 |
|
43 |
def shorten_bytes_object(bytes_object, limit=500):
|
|
|
129 |
|
130 |
Args:
|
131 |
client_id (int): The client ID to consider.
|
132 |
+
client_type (Optional[str]): The type of client to consider (either 'user', 'bank',
|
133 |
+
'third_party' or None).
|
134 |
file_name (str): File name to send (either 'evaluation_key' or 'encrypted_inputs').
|
135 |
"""
|
136 |
# Get the paths to the encrypted inputs
|
|
|
218 |
encrypted_inputs = client.quantize_encrypt_serialize_multi_inputs(
|
219 |
inputs,
|
220 |
input_index=INPUT_INDEXES[client_type],
|
221 |
+
processed_input_shape=PROCESSED_INPUT_SHAPE,
|
222 |
input_slice=INPUT_SLICES[client_type],
|
223 |
)
|
224 |
|
|
|
256 |
# Retrieve boolean values
|
257 |
own_car = "Car" in bool_inputs
|
258 |
own_property = "Property" in bool_inputs
|
259 |
+
mobile_phone = "Mobile phone" in bool_inputs
|
|
|
|
|
260 |
|
261 |
user_inputs = pandas.DataFrame({
|
262 |
"Own_car": [own_car],
|
263 |
"Own_property": [own_property],
|
264 |
+
"Mobile_phone": [mobile_phone],
|
|
|
|
|
265 |
"Num_children": [num_children],
|
266 |
"Household_size": [household_size],
|
267 |
"Total_income": [total_income],
|
|
|
292 |
the encrypted input to send.
|
293 |
"""
|
294 |
account_age = inputs[0]
|
295 |
+
|
296 |
+
bank_inputs = pandas.DataFrame({
|
297 |
+
"Account_age": [account_age],
|
298 |
+
})
|
299 |
+
|
300 |
+
bank_inputs = bank_inputs.reindex(BANK_COLUMNS, axis=1)
|
301 |
+
|
302 |
+
preprocessed_bank_inputs = PRE_PROCESSOR_BANK.transform(bank_inputs)
|
303 |
|
304 |
+
return _encrypt_send(client_id, preprocessed_bank_inputs, "bank")
|
305 |
|
306 |
|
307 |
def pre_process_encrypt_send_third_party(client_id, *inputs):
|
|
|
430 |
# Determine the predicted class
|
431 |
output = numpy.argmax(output_proba, axis=1).squeeze()
|
432 |
|
433 |
+
return "Credit card is likely to be approved ✅" if output == 1 else "Credit card is likely to be denied ❌"
|
|
data/data.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/gpt_data.csv
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ID,Car,Property,Work phone,Phone,Email,Number of children,Household size,Income,Age,Income type,Education,Family,Occupation,Housing,Account age (months),Employed,Years of employment,Credit card Approval
|
2 |
+
0,Yes,Yes,No,Yes,Yes,2,4,50000,35,Salaried,Higher education,Married,Engineer,House / apartment,36,Yes,5,Yes
|
3 |
+
1,No,No,Yes,Yes,No,0,1,20000,50,Pensioner,Secondary,Widow,Retired,Rented apartment,12,No,0,No
|
4 |
+
2,Yes,Yes,No,No,Yes,1,3,75000,40,Self-employed,Higher education,Civil marriage,Business Owner,House / apartment,48,Yes,10,Yes
|
5 |
+
3,No,Yes,Yes,No,No,1,2,30000,28,Salaried,Secondary,Single,Teacher,With parents,24,Yes,3,Yes
|
6 |
+
4,Yes,No,No,Yes,Yes,0,1,45000,32,Self-employed,Higher education,Divorced,Freelancer,Rented apartment,60,Yes,7,Yes
|
7 |
+
5,No,Yes,Yes,Yes,No,3,5,35000,45,Salaried,Incomplete higher,Married,Clerk,House / apartment,30,Yes,10,Yes
|
8 |
+
6,Yes,No,No,No,Yes,0,1,25000,55,Pensioner,Lower secondary,Widow,Retired,House / apartment,120,No,0,No
|
9 |
+
7,No,Yes,Yes,Yes,Yes,2,3,60000,37,Salaried,Higher education,Civil marriage,Manager,House / apartment,36,Yes,12,Yes
|
10 |
+
8,Yes,Yes,Yes,Yes,Yes,0,1,80000,29,Self-employed,Higher education,Single,Entrepreneur,House / apartment,48,Yes,6,Yes
|
11 |
+
9,No,Yes,No,Yes,No,4,6,40000,38,Salaried,Secondary,Married,Salesperson,With parents,60,Yes,15,Yes
|
12 |
+
10,Yes,No,Yes,No,Yes,1,2,55000,26,Self-employed,Higher education,Civil marriage,Designer,Rented apartment,12,Yes,3,Yes
|
13 |
+
11,No,No,No,No,No,0,1,30000,60,Pensioner,Lower secondary,Widow,Retired,House / apartment,180,No,0,No
|
14 |
+
12,Yes,Yes,Yes,Yes,Yes,2,4,70000,45,Salaried,Higher education,Married,Manager,House / apartment,72,Yes,20,Yes
|
15 |
+
13,No,Yes,Yes,Yes,Yes,3,5,65000,50,Salaried,Incomplete higher,Divorced,Teacher,House / apartment,96,Yes,25,Yes
|
16 |
+
14,Yes,No,Yes,Yes,No,0,1,85000,33,Self-employed,Higher education,Single,Consultant,Rented apartment,36,Yes,10,Yes
|
17 |
+
15,No,Yes,No,No,Yes,1,3,32000,55,Pensioner,Secondary,Widow,Retired,House / apartment,150,No,0,No
|
18 |
+
16,Yes,Yes,Yes,Yes,Yes,2,4,90000,41,Salaried,Higher education,Married,Doctor,House / apartment,60,Yes,15,Yes
|
19 |
+
17,No,No,No,Yes,Yes,0,1,23000,24,Student,Incomplete higher,Single,Student,With parents,6,No,0,No
|
20 |
+
18,Yes,No,Yes,No,Yes,1,3,48000,35,Self-employed,Secondary,Divorced,Chef,Rented apartment,24,Yes,5,Yes
|
21 |
+
19,No,Yes,No,Yes,No,3,5,55000,50,Salaried,Higher education,Widow,Engineer,House / apartment,120,Yes,20,Yes
|
22 |
+
20,Yes,No,Yes,Yes,Yes,0,2,70000,30,Salaried,Higher education,Civil marriage,Lawyer,Rented apartment,36,Yes,8,Yes
|
23 |
+
21,No,Yes,Yes,No,Yes,2,4,32000,40,Self-employed,Secondary,Married,Carpenter,House / apartment,48,Yes,12,Yes
|
24 |
+
22,Yes,No,No,Yes,No,1,2,26000,29,Student,Higher education,Single,Student,With parents,12,No,0,No
|
25 |
+
23,No,Yes,Yes,Yes,Yes,0,1,45000,55,Pensioner,Lower secondary,Divorced,Retired,House / apartment,180,No,0,No
|
26 |
+
24,Yes,Yes,No,Yes,Yes,1,3,80000,46,Salaried,Higher education,Married,Architect,House / apartment,72,Yes,18,Yes
|
27 |
+
25,No,No,Yes,No,No,2,5,37000,33,Self-employed,Secondary,Single,Plumber,With parents,24,Yes,6,Yes
|
28 |
+
26,Yes,Yes,Yes,Yes,No,0,1,65000,28,Self-employed,Higher education,Single,Graphic Designer,Rented apartment,36,Yes,4,Yes
|
29 |
+
27,No,No,No,No,Yes,1,2,29000,53,Pensioner,Incomplete higher,Widow,Retired,House / apartment,144,No,0,No
|
30 |
+
28,Yes,Yes,Yes,No,Yes,3,6,58000,39,Salaried,Secondary,Divorced,Teacher,House / apartment,60,Yes,10,Yes
|
31 |
+
29,No,Yes,No,Yes,No,0,1,40000,31,Salaried,Higher education,Single,Nurse,Rented apartment,48,Yes,7,Yes
|
32 |
+
30,Yes,No,Yes,Yes,Yes,2,4,51000,42,Self-employed,Higher education,Civil marriage,Business Owner,House / apartment,60,Yes,14,Yes
|
33 |
+
31,No,Yes,No,No,No,1,3,34000,47,Salaried,Secondary,Married,Factory Worker,With parents,72,Yes,9,No
|
34 |
+
32,No,No,Yes,Yes,Yes,0,1,85000,25,Self-employed,Higher education,Single,Entrepreneur,Rented apartment,12,Yes,2,Yes
|
35 |
+
33,Yes,Yes,No,No,No,3,5,30000,55,Salaried,Secondary,Married,Teacher,House / apartment,240,Yes,30,Yes
|
36 |
+
34,No,No,Yes,Yes,Yes,1,2,45000,40,Salaried,Higher education,Divorced,Nurse,With parents,36,Yes,10,Yes
|
37 |
+
35,Yes,No,No,Yes,No,0,1,20000,50,Pensioner,Lower secondary,Widow,Retired,Rented apartment,60,No,0,No
|
38 |
+
36,No,Yes,Yes,No,Yes,2,4,35000,30,Salaried,Higher education,Civil marriage,Clerk,House / apartment,24,Yes,5,Yes
|
39 |
+
37,Yes,No,Yes,Yes,No,1,3,60000,33,Self-employed,Secondary,Single,Freelancer,Rented apartment,48,Yes,7,No
|
40 |
+
38,No,No,No,No,Yes,0,1,75000,28,Salaried,Higher education,Single,Manager,With parents,12,Yes,3,Yes
|
41 |
+
39,Yes,Yes,Yes,Yes,No,3,6,40000,45,Salaried,Incomplete higher,Married,Salesperson,House / apartment,96,Yes,15,Yes
|
42 |
+
40,Yes,No,Yes,Yes,Yes,0,1,30000,65,Pensioner,Higher education,Widow,Retired,House / apartment,420,No,40,Yes
|
43 |
+
41,No,Yes,No,No,No,1,2,22000,26,Salaried,Secondary,Single,Junior Clerk,With parents,24,Yes,2,No
|
44 |
+
42,Yes,No,Yes,Yes,No,2,4,18000,35,Salaried,Lower secondary,Married,Factory Worker,House / apartment,60,Yes,10,Yes
|
45 |
+
43,No,No,No,Yes,Yes,0,1,50000,55,Self-employed,Higher education,Divorced,Consultant,Rented apartment,180,Yes,30,Yes
|
46 |
+
44,Yes,Yes,Yes,No,No,3,5,25000,40,Salaried,Incomplete higher,Married,Cook,House / apartment,120,Yes,15,No
|
47 |
+
45,No,Yes,No,Yes,Yes,1,2,35000,28,Self-employed,Secondary,Single,Graphic Designer,Rented apartment,36,Yes,5,Yes
|
48 |
+
46,Yes,No,Yes,No,Yes,0,1,45000,30,Salaried,Higher education,Civil marriage,Engineer,With parents,48,Yes,8,Yes
|
49 |
+
47,No,Yes,Yes,Yes,No,2,3,27000,48,Salaried,Secondary,Divorced,Salesperson,House / apartment,240,Yes,20,No
|
50 |
+
48,Yes,No,Yes,Yes,No,2,4,40000,45,Self-employed,Secondary,Married,Artist,House / apartment,120,Yes,15,Yes
|
51 |
+
49,No,Yes,No,No,Yes,0,1,55000,50,Salaried,Higher education,Widow,Scientist,Rented apartment,300,No,25,Yes
|
52 |
+
50,Yes,No,Yes,Yes,Yes,1,2,26000,27,Salaried,Incomplete higher,Single,Junior Accountant,With parents,36,Yes,3,No
|
53 |
+
51,No,No,No,No,No,3,6,22000,38,Salaried,Lower secondary,Divorced,Bus Driver,House / apartment,72,Yes,10,Yes
|
54 |
+
52,Yes,Yes,Yes,No,Yes,0,1,32000,55,Pensioner,Higher education,Married,Retired,House / apartment,420,No,30,No
|
55 |
+
53,No,Yes,No,Yes,No,2,3,29000,33,Self-employed,Secondary,Civil marriage,Shop Owner,Rented apartment,60,Yes,7,Yes
|
56 |
+
54,Yes,No,Yes,Yes,Yes,1,2,68000,29,Salaried,Higher education,Single,Software Developer,With parents,48,Yes,6,Yes
|
57 |
+
55,No,No,No,No,Yes,0,1,36000,42,Self-employed,Incomplete higher,Divorced,Freelancer,House / apartment,96,Yes,12,No
|
deployment_files/client.zip
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:859c5895ff365bfd7fa8b592784717d14094c29aaee032c2f23716d225c15855
|
3 |
+
size 29354
|
deployment_files/pre_processor_bank.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c76179d6cb576a61f6d57e29f45545d8c4dd2d86a3818730f5c6bdd35faebe4a
|
3 |
+
size 1098
|
deployment_files/pre_processor_third_party.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8bc9d12a6ac40e3b7f88f3f15e688879b819bc4a995ad0e91fc08af85f55513b
|
3 |
+
size 1128
|
deployment_files/pre_processor_user.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a4300414ee222355b2ac9e0f5e4dc21b50f0a4de8e17e0ed4121dc766231d010
|
3 |
+
size 3340
|
deployment_files/server.zip
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3490a405f8634b3c95f6a2d3fb8cf0276e3f49adc4b25911afab7f97524c2f7a
|
3 |
+
size 2308
|
development.py
CHANGED
@@ -5,16 +5,12 @@ import numpy
|
|
5 |
import pandas
|
6 |
import pickle
|
7 |
|
8 |
-
from sklearn.model_selection import train_test_split
|
9 |
-
from sklearn.metrics import accuracy_score
|
10 |
-
from imblearn.over_sampling import SMOTE
|
11 |
-
|
12 |
from settings import (
|
13 |
-
DEPLOYMENT_PATH,
|
14 |
-
RANDOM_STATE,
|
15 |
DATA_PATH,
|
16 |
INPUT_SLICES,
|
17 |
PRE_PROCESSOR_USER_PATH,
|
|
|
18 |
PRE_PROCESSOR_THIRD_PARTY_PATH,
|
19 |
USER_COLUMNS,
|
20 |
BANK_COLUMNS,
|
@@ -34,19 +30,12 @@ def get_processed_multi_inputs(data):
|
|
34 |
|
35 |
print("Load and pre-process the data")
|
36 |
|
37 |
-
#
|
38 |
-
# https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction/data
|
39 |
-
# It then has been cleaned using the following notebook :
|
40 |
-
# https://www.kaggle.com/code/samuelcortinhas/credit-cards-data-cleaning
|
41 |
-
# A few additional pre-processing steps has bee applied to this data set as well :
|
42 |
-
# - "ID" column has been removed
|
43 |
-
# - "Total_income" values have been multiplied by 0.14 to make its median match France's annual
|
44 |
-
# salary one from 2023 (22050 euros)
|
45 |
data = pandas.read_csv(DATA_PATH, encoding="utf-8")
|
46 |
|
47 |
# Define input and target data
|
48 |
data_x = data.copy()
|
49 |
-
data_y = data_x.pop("Target").copy()
|
50 |
|
51 |
# Get data from all parties
|
52 |
data_user = data_x[USER_COLUMNS].copy()
|
@@ -54,30 +43,22 @@ data_bank = data_x[BANK_COLUMNS].copy()
|
|
54 |
data_third_party = data_x[THIRD_PARTY_COLUMNS].copy()
|
55 |
|
56 |
# Feature engineer the data
|
57 |
-
pre_processor_user, pre_processor_third_party = get_pre_processors()
|
58 |
|
59 |
preprocessed_data_user = pre_processor_user.fit_transform(data_user)
|
60 |
-
preprocessed_data_bank =
|
61 |
preprocessed_data_third_party = pre_processor_third_party.fit_transform(data_third_party)
|
62 |
|
63 |
preprocessed_data_x = numpy.concatenate((preprocessed_data_user, preprocessed_data_bank, preprocessed_data_third_party), axis=1)
|
64 |
|
65 |
-
# The initial data-set is very imbalanced: use SMOTE to get better results
|
66 |
-
x, y = SMOTE().fit_resample(preprocessed_data_x, data_y)
|
67 |
-
|
68 |
-
# Retrieve the training and testing data
|
69 |
-
X_train, X_test, y_train, y_test = train_test_split(
|
70 |
-
x, y, stratify=y, test_size=0.3, random_state=RANDOM_STATE
|
71 |
-
)
|
72 |
-
|
73 |
|
74 |
print("\nTrain and compile the model")
|
75 |
|
76 |
-
model = MultiInputXGBClassifier(max_depth=3, n_estimators=
|
77 |
|
78 |
-
model, sklearn_model = model.fit_benchmark(
|
79 |
|
80 |
-
multi_inputs_train = get_processed_multi_inputs(
|
81 |
|
82 |
model.compile(*multi_inputs_train, inputs_encryption_status=["encrypted", "encrypted", "encrypted"])
|
83 |
|
@@ -86,19 +67,6 @@ if DEPLOYMENT_PATH.is_dir():
|
|
86 |
shutil.rmtree(DEPLOYMENT_PATH)
|
87 |
|
88 |
|
89 |
-
print("\nEvaluate the models")
|
90 |
-
|
91 |
-
y_pred_sklearn = sklearn_model.predict(X_test)
|
92 |
-
|
93 |
-
print(f"Sklearn accuracy score : {accuracy_score(y_test, y_pred_sklearn )*100:.2f}%")
|
94 |
-
|
95 |
-
multi_inputs_test = get_processed_multi_inputs(X_test)
|
96 |
-
|
97 |
-
y_pred_simulated = model.predict_multi_inputs(*multi_inputs_test, simulate=True)
|
98 |
-
|
99 |
-
print(f"Concrete ML accuracy score (simulated) : {accuracy_score(y_test, y_pred_simulated)*100:.2f}%")
|
100 |
-
|
101 |
-
|
102 |
print("\nSave deployment files")
|
103 |
|
104 |
# Save files needed for deployment (and enable cross-platform deployment)
|
@@ -106,10 +74,13 @@ fhe_dev = MultiInputsFHEModelDev(DEPLOYMENT_PATH, model)
|
|
106 |
fhe_dev.save(via_mlir=True)
|
107 |
|
108 |
# Save pre-processors
|
109 |
-
with
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
114 |
|
115 |
print("\nDone !")
|
|
|
5 |
import pandas
|
6 |
import pickle
|
7 |
|
|
|
|
|
|
|
|
|
8 |
from settings import (
|
9 |
+
DEPLOYMENT_PATH,
|
|
|
10 |
DATA_PATH,
|
11 |
INPUT_SLICES,
|
12 |
PRE_PROCESSOR_USER_PATH,
|
13 |
+
PRE_PROCESSOR_BANK_PATH,
|
14 |
PRE_PROCESSOR_THIRD_PARTY_PATH,
|
15 |
USER_COLUMNS,
|
16 |
BANK_COLUMNS,
|
|
|
30 |
|
31 |
print("Load and pre-process the data")
|
32 |
|
33 |
+
# Load the data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
data = pandas.read_csv(DATA_PATH, encoding="utf-8")
|
35 |
|
36 |
# Define input and target data
|
37 |
data_x = data.copy()
|
38 |
+
data_y = data_x.pop("Target").copy().to_frame()
|
39 |
|
40 |
# Get data from all parties
|
41 |
data_user = data_x[USER_COLUMNS].copy()
|
|
|
43 |
data_third_party = data_x[THIRD_PARTY_COLUMNS].copy()
|
44 |
|
45 |
# Feature engineer the data
|
46 |
+
pre_processor_user, pre_processor_bank, pre_processor_third_party = get_pre_processors()
|
47 |
|
48 |
preprocessed_data_user = pre_processor_user.fit_transform(data_user)
|
49 |
+
preprocessed_data_bank = pre_processor_bank.fit_transform(data_bank)
|
50 |
preprocessed_data_third_party = pre_processor_third_party.fit_transform(data_third_party)
|
51 |
|
52 |
preprocessed_data_x = numpy.concatenate((preprocessed_data_user, preprocessed_data_bank, preprocessed_data_third_party), axis=1)
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
print("\nTrain and compile the model")
|
56 |
|
57 |
+
model = MultiInputXGBClassifier(max_depth=3, n_estimators=20)
|
58 |
|
59 |
+
model, sklearn_model = model.fit_benchmark(preprocessed_data_x, data_y)
|
60 |
|
61 |
+
multi_inputs_train = get_processed_multi_inputs(preprocessed_data_x)
|
62 |
|
63 |
model.compile(*multi_inputs_train, inputs_encryption_status=["encrypted", "encrypted", "encrypted"])
|
64 |
|
|
|
67 |
shutil.rmtree(DEPLOYMENT_PATH)
|
68 |
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
print("\nSave deployment files")
|
71 |
|
72 |
# Save files needed for deployment (and enable cross-platform deployment)
|
|
|
74 |
fhe_dev.save(via_mlir=True)
|
75 |
|
76 |
# Save pre-processors
|
77 |
+
with (
|
78 |
+
PRE_PROCESSOR_USER_PATH.open('wb') as file_user,
|
79 |
+
PRE_PROCESSOR_BANK_PATH.open('wb') as file_bank,
|
80 |
+
PRE_PROCESSOR_THIRD_PARTY_PATH.open('wb') as file_third_party,
|
81 |
+
):
|
82 |
+
pickle.dump(pre_processor_user, file_user)
|
83 |
+
pickle.dump(pre_processor_bank, file_bank)
|
84 |
+
pickle.dump(pre_processor_third_party, file_third_party)
|
85 |
|
86 |
print("\nDone !")
|
requirements.txt
CHANGED
@@ -1,3 +1,2 @@
|
|
1 |
gradio==3.40.1
|
2 |
-
concrete-ml==1.3.0
|
3 |
-
imbalanced-learn==0.11.0
|
|
|
1 |
gradio==3.40.1
|
2 |
+
concrete-ml==1.3.0
|
|
settings.py
CHANGED
@@ -14,6 +14,7 @@ SERVER_FILES = REPO_DIR / "server_files"
|
|
14 |
|
15 |
# Path targeting pre-processor saved files
|
16 |
PRE_PROCESSOR_USER_PATH = DEPLOYMENT_PATH / 'pre_processor_user.pkl'
|
|
|
17 |
PRE_PROCESSOR_THIRD_PARTY_PATH = DEPLOYMENT_PATH / 'pre_processor_third_party.pkl'
|
18 |
|
19 |
# Create the necessary directories
|
@@ -25,13 +26,10 @@ SERVER_FILES.mkdir(exist_ok=True)
|
|
25 |
SERVER_URL = "http://localhost:8000/"
|
26 |
|
27 |
# Path to data file
|
28 |
-
# Details about pre-processing steps can be found in the 'development.py' and 'pre_processing.py'
|
29 |
-
# files
|
30 |
DATA_PATH = "data/data.csv"
|
31 |
|
32 |
# Development settings
|
33 |
-
|
34 |
-
INITIAL_INPUT_SHAPE = (1, 49)
|
35 |
|
36 |
CLIENT_TYPES = ["user", "bank", "third_party"]
|
37 |
INPUT_INDEXES = {
|
@@ -40,13 +38,13 @@ INPUT_INDEXES = {
|
|
40 |
"third_party": 2,
|
41 |
}
|
42 |
INPUT_SLICES = {
|
43 |
-
"user": slice(0,
|
44 |
-
"bank": slice(
|
45 |
-
"third_party": slice(
|
46 |
}
|
47 |
|
48 |
USER_COLUMNS = [
|
49 |
-
'Own_car', 'Own_property', '
|
50 |
'Total_income', 'Age', 'Income_type', 'Education_type', 'Family_status', 'Housing_type',
|
51 |
'Occupation_type',
|
52 |
]
|
|
|
14 |
|
15 |
# Path targeting pre-processor saved files
|
16 |
PRE_PROCESSOR_USER_PATH = DEPLOYMENT_PATH / 'pre_processor_user.pkl'
|
17 |
+
PRE_PROCESSOR_BANK_PATH = DEPLOYMENT_PATH / 'pre_processor_bank.pkl'
|
18 |
PRE_PROCESSOR_THIRD_PARTY_PATH = DEPLOYMENT_PATH / 'pre_processor_third_party.pkl'
|
19 |
|
20 |
# Create the necessary directories
|
|
|
26 |
SERVER_URL = "http://localhost:8000/"
|
27 |
|
28 |
# Path to data file
|
|
|
|
|
29 |
DATA_PATH = "data/data.csv"
|
30 |
|
31 |
# Development settings
|
32 |
+
PROCESSED_INPUT_SHAPE = (1, 39)
|
|
|
33 |
|
34 |
CLIENT_TYPES = ["user", "bank", "third_party"]
|
35 |
INPUT_INDEXES = {
|
|
|
38 |
"third_party": 2,
|
39 |
}
|
40 |
INPUT_SLICES = {
|
41 |
+
"user": slice(0, 36), # First position: start from 0
|
42 |
+
"bank": slice(36, 37), # Second position: start from n_feature_user
|
43 |
+
"third_party": slice(37, 39), # Third position: start from n_feature_user + n_feature_bank
|
44 |
}
|
45 |
|
46 |
USER_COLUMNS = [
|
47 |
+
'Own_car', 'Own_property', 'Mobile_phone', 'Num_children', 'Household_size',
|
48 |
'Total_income', 'Age', 'Income_type', 'Education_type', 'Family_status', 'Housing_type',
|
49 |
'Occupation_type',
|
50 |
]
|
utils/client_server_interface.py
CHANGED
@@ -27,9 +27,15 @@ class MultiInputsFHEModelClient(FHEModelClient):
|
|
27 |
|
28 |
super().__init__(*args, **kwargs)
|
29 |
|
30 |
-
def quantize_encrypt_serialize_multi_inputs(
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
-
x_padded = numpy.zeros(
|
33 |
|
34 |
x_padded[:, input_slice] = x
|
35 |
|
|
|
27 |
|
28 |
super().__init__(*args, **kwargs)
|
29 |
|
30 |
+
def quantize_encrypt_serialize_multi_inputs(
|
31 |
+
self,
|
32 |
+
x: numpy.ndarray,
|
33 |
+
input_index,
|
34 |
+
processed_input_shape,
|
35 |
+
input_slice
|
36 |
+
) -> bytes:
|
37 |
|
38 |
+
x_padded = numpy.zeros(processed_input_shape)
|
39 |
|
40 |
x_padded[:, input_slice] = x
|
41 |
|
utils/pre_processing.py
CHANGED
@@ -1,16 +1,9 @@
|
|
1 |
-
"""Data pre-processing functions.
|
2 |
-
|
3 |
-
The pre-processing steps are heavily inspired by the following notebook :
|
4 |
-
https://www.kaggle.com/code/rikdifos/credit-card-approval-prediction-using-ml
|
5 |
-
|
6 |
-
Additional steps, mostly including renaming some values or features, were added for better user
|
7 |
-
experience.
|
8 |
-
"""
|
9 |
|
10 |
import numpy
|
11 |
from sklearn.compose import ColumnTransformer
|
12 |
from sklearn.pipeline import Pipeline
|
13 |
-
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer,
|
14 |
|
15 |
|
16 |
def _get_pipeline_replace_one_hot(func, value):
|
@@ -23,10 +16,6 @@ def _get_pipeline_replace_one_hot(func, value):
|
|
23 |
("one_hot", OneHotEncoder(),),
|
24 |
])
|
25 |
|
26 |
-
|
27 |
-
def _replace_values_geq(column, value):
|
28 |
-
return numpy.where(column >= value, f"{value}_or_more", column)
|
29 |
-
|
30 |
def _replace_values_eq(column, value):
|
31 |
for desired_value, values_to_replace in value.items():
|
32 |
column = numpy.where(numpy.isin(column, values_to_replace), desired_value, column)
|
@@ -35,41 +24,32 @@ def _replace_values_eq(column, value):
|
|
35 |
def get_pre_processors():
|
36 |
pre_processor_user = ColumnTransformer(
|
37 |
transformers=[
|
38 |
-
(
|
39 |
-
"replace_num_children",
|
40 |
-
_get_pipeline_replace_one_hot(_replace_values_geq, 2),
|
41 |
-
['Num_children']
|
42 |
-
),
|
43 |
-
(
|
44 |
-
"replace_household_size",
|
45 |
-
_get_pipeline_replace_one_hot(_replace_values_geq, 3),
|
46 |
-
['Household_size']
|
47 |
-
),
|
48 |
-
(
|
49 |
-
"replace_income_type",
|
50 |
-
_get_pipeline_replace_one_hot(_replace_values_eq, {"Public Sector": ["Retired", "Student"]}),
|
51 |
-
['Income_type']
|
52 |
-
),
|
53 |
-
(
|
54 |
-
"replace_education_type",
|
55 |
-
_get_pipeline_replace_one_hot(_replace_values_eq, {"Higher education": ["Academic degree"]}),
|
56 |
-
['Education_type']
|
57 |
-
),
|
58 |
(
|
59 |
"replace_occupation_type_labor",
|
60 |
_get_pipeline_replace_one_hot(
|
61 |
_replace_values_eq,
|
62 |
{
|
63 |
-
"Labor_work": [
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
66 |
},
|
67 |
),
|
68 |
['Occupation_type']
|
69 |
),
|
70 |
-
('
|
71 |
-
('
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
],
|
74 |
remainder='passthrough',
|
75 |
verbose_feature_names_out=False,
|
@@ -77,10 +57,10 @@ def get_pre_processors():
|
|
77 |
|
78 |
pre_processor_third_party = ColumnTransformer(
|
79 |
transformers=[
|
80 |
-
('
|
81 |
],
|
82 |
remainder='passthrough',
|
83 |
verbose_feature_names_out=False,
|
84 |
)
|
85 |
|
86 |
-
return pre_processor_user, pre_processor_third_party
|
|
|
1 |
+
"""Data pre-processing functions."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
import numpy
|
4 |
from sklearn.compose import ColumnTransformer
|
5 |
from sklearn.pipeline import Pipeline
|
6 |
+
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
|
7 |
|
8 |
|
9 |
def _get_pipeline_replace_one_hot(func, value):
|
|
|
16 |
("one_hot", OneHotEncoder(),),
|
17 |
])
|
18 |
|
|
|
|
|
|
|
|
|
19 |
def _replace_values_eq(column, value):
|
20 |
for desired_value, values_to_replace in value.items():
|
21 |
column = numpy.where(numpy.isin(column, values_to_replace), desired_value, column)
|
|
|
24 |
def get_pre_processors():
|
25 |
pre_processor_user = ColumnTransformer(
|
26 |
transformers=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
(
|
28 |
"replace_occupation_type_labor",
|
29 |
_get_pipeline_replace_one_hot(
|
30 |
_replace_values_eq,
|
31 |
{
|
32 |
+
"Labor_work": [
|
33 |
+
"Cooking Staff", "Carpenter", "Plumber", "Factory Worker", "Bus Driver"
|
34 |
+
],
|
35 |
+
"Office_work": [
|
36 |
+
"Business Owners", "Office Worker", "Accountant", "Entrepreneur", "Salesperson"
|
37 |
+
],
|
38 |
+
"High_tech_work": ["Engineer", "Manager", "Consultant", "Software Developer"],
|
39 |
},
|
40 |
),
|
41 |
['Occupation_type']
|
42 |
),
|
43 |
+
('one_hot_others', OneHotEncoder(), ['Housing_type', 'Family_status', 'Education_type', 'Income_type']),
|
44 |
+
('standard_scaler', StandardScaler(), ['Num_children', 'Household_size', 'Total_income', 'Age']),
|
45 |
+
],
|
46 |
+
remainder='passthrough',
|
47 |
+
verbose_feature_names_out=False,
|
48 |
+
)
|
49 |
+
|
50 |
+
pre_processor_bank = ColumnTransformer(
|
51 |
+
transformers=[
|
52 |
+
('standard_scaler', StandardScaler(), ['Account_age']),
|
53 |
],
|
54 |
remainder='passthrough',
|
55 |
verbose_feature_names_out=False,
|
|
|
57 |
|
58 |
pre_processor_third_party = ColumnTransformer(
|
59 |
transformers=[
|
60 |
+
('standard_scaler', StandardScaler(), ['Years_employed']),
|
61 |
],
|
62 |
remainder='passthrough',
|
63 |
verbose_feature_names_out=False,
|
64 |
)
|
65 |
|
66 |
+
return pre_processor_user, pre_processor_bank, pre_processor_third_party
|