romanbredehoft-zama committed on
Commit
18ba8c1
1 Parent(s): e3103dd

Update to synthetic data-set

app.py CHANGED
@@ -76,7 +76,7 @@ with demo:
     with gr.Row():
         with gr.Column():
             gr.Markdown("### User")
-            bool_inputs = gr.CheckboxGroup(["Car", "Property", "Work phone", "Phone", "Email"], label="Which of the following do you actively hold or own?")
+            bool_inputs = gr.CheckboxGroup(["Car", "Property", "Mobile phone"], label="Which of the following do you actively hold or own?")
             num_children = gr.Slider(**CHILDREN_MIN_MAX, step=1, label="Number of children", info="How many children do you have ?")
             household_size = gr.Slider(**FAMILY_MIN_MAX, step=1, label="Household size", info="How many members does your household have? ?")
             total_income = gr.Slider(**INCOME_MIN_MAX, label="Income", info="What's you total yearly income (in euros) ?")
@@ -132,7 +132,7 @@ with demo:
         needing to decrypt any value.
 
         This server employs an [XGBoost](https://github.com/dmlc/xgboost) classifier model that has
-        been trained on [this credit card data-set](https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction/data).
+        been trained on a synthetic data-set.
         """
     )
 
 
backend.py CHANGED
@@ -15,24 +15,29 @@ from settings import (
     CLIENT_FILES,
     SERVER_FILES,
     DEPLOYMENT_PATH,
-    INITIAL_INPUT_SHAPE,
+    PROCESSED_INPUT_SHAPE,
     INPUT_INDEXES,
     INPUT_SLICES,
     PRE_PROCESSOR_USER_PATH,
+    PRE_PROCESSOR_BANK_PATH,
     PRE_PROCESSOR_THIRD_PARTY_PATH,
     CLIENT_TYPES,
     USER_COLUMNS,
+    BANK_COLUMNS,
     THIRD_PARTY_COLUMNS,
 )
 
 from utils.client_server_interface import MultiInputsFHEModelClient
 
 # Load pre-processor instances
-with PRE_PROCESSOR_USER_PATH.open('rb') as file:
-    PRE_PROCESSOR_USER = pickle.load(file)
-
-with PRE_PROCESSOR_THIRD_PARTY_PATH.open('rb') as file:
-    PRE_PROCESSOR_THIRD_PARTY = pickle.load(file)
+with (
+    PRE_PROCESSOR_USER_PATH.open('rb') as file_user,
+    PRE_PROCESSOR_BANK_PATH.open('rb') as file_bank,
+    PRE_PROCESSOR_THIRD_PARTY_PATH.open('rb') as file_third_party,
+):
+    PRE_PROCESSOR_USER = pickle.load(file_user)
+    PRE_PROCESSOR_BANK = pickle.load(file_bank)
+    PRE_PROCESSOR_THIRD_PARTY = pickle.load(file_third_party)
 
 
 def shorten_bytes_object(bytes_object, limit=500):
@@ -124,8 +129,8 @@ def _send_to_server(client_id, client_type, file_name):
 
     Args:
         client_id (int): The client ID to consider.
-        client_type (Optional[str]): The type of client to consider (either 'user', 'bank', 'third_party' or
-            None).
+        client_type (Optional[str]): The type of client to consider (either 'user', 'bank',
+            'third_party' or None).
         file_name (str): File name to send (either 'evaluation_key' or 'encrypted_inputs').
     """
     # Get the paths to the encrypted inputs
@@ -213,7 +218,7 @@ def _encrypt_send(client_id, inputs, client_type):
     encrypted_inputs = client.quantize_encrypt_serialize_multi_inputs(
         inputs,
         input_index=INPUT_INDEXES[client_type],
-        initial_input_shape=INITIAL_INPUT_SHAPE,
+        processed_input_shape=PROCESSED_INPUT_SHAPE,
         input_slice=INPUT_SLICES[client_type],
     )
 
@@ -251,16 +256,12 @@ def pre_process_encrypt_send_user(client_id, *inputs):
     # Retrieve boolean values
     own_car = "Car" in bool_inputs
     own_property = "Property" in bool_inputs
-    work_phone = "Work phone" in bool_inputs
-    phone = "Phone" in bool_inputs
-    email = "Email" in bool_inputs
+    mobile_phone = "Mobile phone" in bool_inputs
 
     user_inputs = pandas.DataFrame({
         "Own_car": [own_car],
         "Own_property": [own_property],
-        "Work_phone": [work_phone],
-        "Phone": [phone],
-        "Email": [email],
+        "Mobile_phone": [mobile_phone],
         "Num_children": [num_children],
         "Household_size": [household_size],
         "Total_income": [total_income],
@@ -291,8 +292,16 @@ def pre_process_encrypt_send_bank(client_id, *inputs):
         the encrypted input to send.
     """
     account_age = inputs[0]
+
+    bank_inputs = pandas.DataFrame({
+        "Account_age": [account_age],
+    })
+
+    bank_inputs = bank_inputs.reindex(BANK_COLUMNS, axis=1)
+
+    preprocessed_bank_inputs = PRE_PROCESSOR_BANK.transform(bank_inputs)
 
-    return _encrypt_send(client_id, account_age, "bank")
+    return _encrypt_send(client_id, preprocessed_bank_inputs, "bank")
 
 
 def pre_process_encrypt_send_third_party(client_id, *inputs):
@@ -421,5 +430,4 @@ def decrypt_output(client_id):
     # Determine the predicted class
     output = numpy.argmax(output_proba, axis=1).squeeze()
 
-    # A "0" output means approving the credit card has low risk, while "1" is high risk
-    return "Credit card is likely to be approved ✅" if output == 0 else "Credit card is likely to be denied ❌"
+    return "Credit card is likely to be approved ✅" if output == 1 else "Credit card is likely to be denied ❌"
 
data/data.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/gpt_data.csv ADDED
@@ -0,0 +1,57 @@
+ID,Car,Property,Work phone,Phone,Email,Number of children,Household size,Income,Age,Income type,Education,Family,Occupation,Housing,Account age (months),Employed,Years of employment,Credit card Approval
+0,Yes,Yes,No,Yes,Yes,2,4,50000,35,Salaried,Higher education,Married,Engineer,House / apartment,36,Yes,5,Yes
+1,No,No,Yes,Yes,No,0,1,20000,50,Pensioner,Secondary,Widow,Retired,Rented apartment,12,No,0,No
+2,Yes,Yes,No,No,Yes,1,3,75000,40,Self-employed,Higher education,Civil marriage,Business Owner,House / apartment,48,Yes,10,Yes
+3,No,Yes,Yes,No,No,1,2,30000,28,Salaried,Secondary,Single,Teacher,With parents,24,Yes,3,Yes
+4,Yes,No,No,Yes,Yes,0,1,45000,32,Self-employed,Higher education,Divorced,Freelancer,Rented apartment,60,Yes,7,Yes
+5,No,Yes,Yes,Yes,No,3,5,35000,45,Salaried,Incomplete higher,Married,Clerk,House / apartment,30,Yes,10,Yes
+6,Yes,No,No,No,Yes,0,1,25000,55,Pensioner,Lower secondary,Widow,Retired,House / apartment,120,No,0,No
+7,No,Yes,Yes,Yes,Yes,2,3,60000,37,Salaried,Higher education,Civil marriage,Manager,House / apartment,36,Yes,12,Yes
+8,Yes,Yes,Yes,Yes,Yes,0,1,80000,29,Self-employed,Higher education,Single,Entrepreneur,House / apartment,48,Yes,6,Yes
+9,No,Yes,No,Yes,No,4,6,40000,38,Salaried,Secondary,Married,Salesperson,With parents,60,Yes,15,Yes
+10,Yes,No,Yes,No,Yes,1,2,55000,26,Self-employed,Higher education,Civil marriage,Designer,Rented apartment,12,Yes,3,Yes
+11,No,No,No,No,No,0,1,30000,60,Pensioner,Lower secondary,Widow,Retired,House / apartment,180,No,0,No
+12,Yes,Yes,Yes,Yes,Yes,2,4,70000,45,Salaried,Higher education,Married,Manager,House / apartment,72,Yes,20,Yes
+13,No,Yes,Yes,Yes,Yes,3,5,65000,50,Salaried,Incomplete higher,Divorced,Teacher,House / apartment,96,Yes,25,Yes
+14,Yes,No,Yes,Yes,No,0,1,85000,33,Self-employed,Higher education,Single,Consultant,Rented apartment,36,Yes,10,Yes
+15,No,Yes,No,No,Yes,1,3,32000,55,Pensioner,Secondary,Widow,Retired,House / apartment,150,No,0,No
+16,Yes,Yes,Yes,Yes,Yes,2,4,90000,41,Salaried,Higher education,Married,Doctor,House / apartment,60,Yes,15,Yes
+17,No,No,No,Yes,Yes,0,1,23000,24,Student,Incomplete higher,Single,Student,With parents,6,No,0,No
+18,Yes,No,Yes,No,Yes,1,3,48000,35,Self-employed,Secondary,Divorced,Chef,Rented apartment,24,Yes,5,Yes
+19,No,Yes,No,Yes,No,3,5,55000,50,Salaried,Higher education,Widow,Engineer,House / apartment,120,Yes,20,Yes
+20,Yes,No,Yes,Yes,Yes,0,2,70000,30,Salaried,Higher education,Civil marriage,Lawyer,Rented apartment,36,Yes,8,Yes
+21,No,Yes,Yes,No,Yes,2,4,32000,40,Self-employed,Secondary,Married,Carpenter,House / apartment,48,Yes,12,Yes
+22,Yes,No,No,Yes,No,1,2,26000,29,Student,Higher education,Single,Student,With parents,12,No,0,No
+23,No,Yes,Yes,Yes,Yes,0,1,45000,55,Pensioner,Lower secondary,Divorced,Retired,House / apartment,180,No,0,No
+24,Yes,Yes,No,Yes,Yes,1,3,80000,46,Salaried,Higher education,Married,Architect,House / apartment,72,Yes,18,Yes
+25,No,No,Yes,No,No,2,5,37000,33,Self-employed,Secondary,Single,Plumber,With parents,24,Yes,6,Yes
+26,Yes,Yes,Yes,Yes,No,0,1,65000,28,Self-employed,Higher education,Single,Graphic Designer,Rented apartment,36,Yes,4,Yes
+27,No,No,No,No,Yes,1,2,29000,53,Pensioner,Incomplete higher,Widow,Retired,House / apartment,144,No,0,No
+28,Yes,Yes,Yes,No,Yes,3,6,58000,39,Salaried,Secondary,Divorced,Teacher,House / apartment,60,Yes,10,Yes
+29,No,Yes,No,Yes,No,0,1,40000,31,Salaried,Higher education,Single,Nurse,Rented apartment,48,Yes,7,Yes
+30,Yes,No,Yes,Yes,Yes,2,4,51000,42,Self-employed,Higher education,Civil marriage,Business Owner,House / apartment,60,Yes,14,Yes
+31,No,Yes,No,No,No,1,3,34000,47,Salaried,Secondary,Married,Factory Worker,With parents,72,Yes,9,No
+32,No,No,Yes,Yes,Yes,0,1,85000,25,Self-employed,Higher education,Single,Entrepreneur,Rented apartment,12,Yes,2,Yes
+33,Yes,Yes,No,No,No,3,5,30000,55,Salaried,Secondary,Married,Teacher,House / apartment,240,Yes,30,Yes
+34,No,No,Yes,Yes,Yes,1,2,45000,40,Salaried,Higher education,Divorced,Nurse,With parents,36,Yes,10,Yes
+35,Yes,No,No,Yes,No,0,1,20000,50,Pensioner,Lower secondary,Widow,Retired,Rented apartment,60,No,0,No
+36,No,Yes,Yes,No,Yes,2,4,35000,30,Salaried,Higher education,Civil marriage,Clerk,House / apartment,24,Yes,5,Yes
+37,Yes,No,Yes,Yes,No,1,3,60000,33,Self-employed,Secondary,Single,Freelancer,Rented apartment,48,Yes,7,No
+38,No,No,No,No,Yes,0,1,75000,28,Salaried,Higher education,Single,Manager,With parents,12,Yes,3,Yes
+39,Yes,Yes,Yes,Yes,No,3,6,40000,45,Salaried,Incomplete higher,Married,Salesperson,House / apartment,96,Yes,15,Yes
+40,Yes,No,Yes,Yes,Yes,0,1,30000,65,Pensioner,Higher education,Widow,Retired,House / apartment,420,No,40,Yes
+41,No,Yes,No,No,No,1,2,22000,26,Salaried,Secondary,Single,Junior Clerk,With parents,24,Yes,2,No
+42,Yes,No,Yes,Yes,No,2,4,18000,35,Salaried,Lower secondary,Married,Factory Worker,House / apartment,60,Yes,10,Yes
+43,No,No,No,Yes,Yes,0,1,50000,55,Self-employed,Higher education,Divorced,Consultant,Rented apartment,180,Yes,30,Yes
+44,Yes,Yes,Yes,No,No,3,5,25000,40,Salaried,Incomplete higher,Married,Cook,House / apartment,120,Yes,15,No
+45,No,Yes,No,Yes,Yes,1,2,35000,28,Self-employed,Secondary,Single,Graphic Designer,Rented apartment,36,Yes,5,Yes
+46,Yes,No,Yes,No,Yes,0,1,45000,30,Salaried,Higher education,Civil marriage,Engineer,With parents,48,Yes,8,Yes
+47,No,Yes,Yes,Yes,No,2,3,27000,48,Salaried,Secondary,Divorced,Salesperson,House / apartment,240,Yes,20,No
+48,Yes,No,Yes,Yes,No,2,4,40000,45,Self-employed,Secondary,Married,Artist,House / apartment,120,Yes,15,Yes
+49,No,Yes,No,No,Yes,0,1,55000,50,Salaried,Higher education,Widow,Scientist,Rented apartment,300,No,25,Yes
+50,Yes,No,Yes,Yes,Yes,1,2,26000,27,Salaried,Incomplete higher,Single,Junior Accountant,With parents,36,Yes,3,No
+51,No,No,No,No,No,3,6,22000,38,Salaried,Lower secondary,Divorced,Bus Driver,House / apartment,72,Yes,10,Yes
+52,Yes,Yes,Yes,No,Yes,0,1,32000,55,Pensioner,Higher education,Married,Retired,House / apartment,420,No,30,No
+53,No,Yes,No,Yes,No,2,3,29000,33,Self-employed,Secondary,Civil marriage,Shop Owner,Rented apartment,60,Yes,7,Yes
+54,Yes,No,Yes,Yes,Yes,1,2,68000,29,Salaried,Higher education,Single,Software Developer,With parents,48,Yes,6,Yes
+55,No,No,No,No,Yes,0,1,36000,42,Self-employed,Incomplete higher,Divorced,Freelancer,House / apartment,96,Yes,12,No
deployment_files/client.zip CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3a407200f51839a85032c3f69ca3413e9cdb99d2e03722f03450599d2f8d318d
-size 76138
+oid sha256:859c5895ff365bfd7fa8b592784717d14094c29aaee032c2f23716d225c15855
+size 29354
deployment_files/pre_processor_bank.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c76179d6cb576a61f6d57e29f45545d8c4dd2d86a3818730f5c6bdd35faebe4a
+size 1098
deployment_files/pre_processor_third_party.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:223d27a2af60a06845da0191cf725b7bab7f31ebb0a84bc530cbd32001ac4f47
-size 1588
+oid sha256:8bc9d12a6ac40e3b7f88f3f15e688879b819bc4a995ad0e91fc08af85f55513b
+size 1128
deployment_files/pre_processor_user.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:60f9d3d6e98f3eadc7a30340dae414df3f42fabd73da01fe0a6ece91186964b5
-size 6222
+oid sha256:a4300414ee222355b2ac9e0f5e4dc21b50f0a4de8e17e0ed4121dc766231d010
+size 3340
deployment_files/server.zip CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:98a6a55f0be2a5fc9b2174b9a9686550fe2a89c3c2ae937aeb31e4057f645a03
-size 3419
+oid sha256:3490a405f8634b3c95f6a2d3fb8cf0276e3f49adc4b25911afab7f97524c2f7a
+size 2308
development.py CHANGED
@@ -5,16 +5,12 @@ import numpy
 import pandas
 import pickle
 
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score
-from imblearn.over_sampling import SMOTE
-
 from settings import (
-    DEPLOYMENT_PATH,
-    RANDOM_STATE,
+    DEPLOYMENT_PATH,
     DATA_PATH,
     INPUT_SLICES,
     PRE_PROCESSOR_USER_PATH,
+    PRE_PROCESSOR_BANK_PATH,
     PRE_PROCESSOR_THIRD_PARTY_PATH,
     USER_COLUMNS,
     BANK_COLUMNS,
@@ -34,19 +30,12 @@ def get_processed_multi_inputs(data):
 
 print("Load and pre-process the data")
 
-# Original data set can be found here :
-# https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction/data
-# It then has been cleaned using the following notebook :
-# https://www.kaggle.com/code/samuelcortinhas/credit-cards-data-cleaning
-# A few additional pre-processing steps has bee applied to this data set as well :
-# - "ID" column has been removed
-# - "Total_income" values have been multiplied by 0.14 to make its median match France's annual
-# salary one from 2023 (22050 euros)
+# Load the data
 data = pandas.read_csv(DATA_PATH, encoding="utf-8")
 
 # Define input and target data
 data_x = data.copy()
-data_y = data_x.pop("Target").copy()
+data_y = data_x.pop("Target").copy().to_frame()
 
 # Get data from all parties
 data_user = data_x[USER_COLUMNS].copy()
@@ -54,30 +43,22 @@ data_bank = data_x[BANK_COLUMNS].copy()
 data_third_party = data_x[THIRD_PARTY_COLUMNS].copy()
 
 # Feature engineer the data
-pre_processor_user, pre_processor_third_party = get_pre_processors()
+pre_processor_user, pre_processor_bank, pre_processor_third_party = get_pre_processors()
 
 preprocessed_data_user = pre_processor_user.fit_transform(data_user)
-preprocessed_data_bank = data_bank.to_numpy()
+preprocessed_data_bank = pre_processor_bank.fit_transform(data_bank)
 preprocessed_data_third_party = pre_processor_third_party.fit_transform(data_third_party)
 
 preprocessed_data_x = numpy.concatenate((preprocessed_data_user, preprocessed_data_bank, preprocessed_data_third_party), axis=1)
 
-# The initial data-set is very imbalanced: use SMOTE to get better results
-x, y = SMOTE().fit_resample(preprocessed_data_x, data_y)
-
-# Retrieve the training and testing data
-X_train, X_test, y_train, y_test = train_test_split(
-    x, y, stratify=y, test_size=0.3, random_state=RANDOM_STATE
-)
-
 
 print("\nTrain and compile the model")
 
-model = MultiInputXGBClassifier(max_depth=3, n_estimators=40)
+model = MultiInputXGBClassifier(max_depth=3, n_estimators=20)
 
-model, sklearn_model = model.fit_benchmark(X_train, y_train)
+model, sklearn_model = model.fit_benchmark(preprocessed_data_x, data_y)
 
-multi_inputs_train = get_processed_multi_inputs(X_train)
+multi_inputs_train = get_processed_multi_inputs(preprocessed_data_x)
 
 model.compile(*multi_inputs_train, inputs_encryption_status=["encrypted", "encrypted", "encrypted"])
@@ -86,19 +67,6 @@ if DEPLOYMENT_PATH.is_dir():
     shutil.rmtree(DEPLOYMENT_PATH)
 
 
-print("\nEvaluate the models")
-
-y_pred_sklearn = sklearn_model.predict(X_test)
-
-print(f"Sklearn accuracy score : {accuracy_score(y_test, y_pred_sklearn )*100:.2f}%")
-
-multi_inputs_test = get_processed_multi_inputs(X_test)
-
-y_pred_simulated = model.predict_multi_inputs(*multi_inputs_test, simulate=True)
-
-print(f"Concrete ML accuracy score (simulated) : {accuracy_score(y_test, y_pred_simulated)*100:.2f}%")
-
-
 print("\nSave deployment files")
 
 # Save files needed for deployment (and enable cross-platform deployment)
@@ -106,10 +74,13 @@ fhe_dev = MultiInputsFHEModelDev(DEPLOYMENT_PATH, model)
 fhe_dev.save(via_mlir=True)
 
 # Save pre-processors
-with PRE_PROCESSOR_USER_PATH.open('wb') as file:
-    pickle.dump(pre_processor_user, file)
-
-with PRE_PROCESSOR_THIRD_PARTY_PATH.open('wb') as file:
-    pickle.dump(pre_processor_third_party, file)
+with (
+    PRE_PROCESSOR_USER_PATH.open('wb') as file_user,
+    PRE_PROCESSOR_BANK_PATH.open('wb') as file_bank,
+    PRE_PROCESSOR_THIRD_PARTY_PATH.open('wb') as file_third_party,
+):
+    pickle.dump(pre_processor_user, file_user)
+    pickle.dump(pre_processor_bank, file_bank)
+    pickle.dump(pre_processor_third_party, file_third_party)
 
 print("\nDone !")
requirements.txt CHANGED
@@ -1,3 +1,2 @@
 gradio==3.40.1
-concrete-ml==1.3.0
-imbalanced-learn==0.11.0
+concrete-ml==1.3.0
 
settings.py CHANGED
@@ -14,6 +14,7 @@ SERVER_FILES = REPO_DIR / "server_files"
 
 # Path targeting pre-processor saved files
 PRE_PROCESSOR_USER_PATH = DEPLOYMENT_PATH / 'pre_processor_user.pkl'
+PRE_PROCESSOR_BANK_PATH = DEPLOYMENT_PATH / 'pre_processor_bank.pkl'
 PRE_PROCESSOR_THIRD_PARTY_PATH = DEPLOYMENT_PATH / 'pre_processor_third_party.pkl'
 
 # Create the necessary directories
@@ -25,13 +26,10 @@ SERVER_FILES.mkdir(exist_ok=True)
 SERVER_URL = "http://localhost:8000/"
 
 # Path to data file
-# Details about pre-processing steps can be found in the 'development.py' and 'pre_processing.py'
-# files
 DATA_PATH = "data/data.csv"
 
 # Development settings
-RANDOM_STATE = 0
-INITIAL_INPUT_SHAPE = (1, 49)
+PROCESSED_INPUT_SHAPE = (1, 39)
 
 CLIENT_TYPES = ["user", "bank", "third_party"]
 INPUT_INDEXES = {
@@ -40,13 +38,13 @@ INPUT_INDEXES = {
     "third_party": 2,
 }
 INPUT_SLICES = {
-    "user": slice(0, 42),  # First position: start from 0
-    "bank": slice(42, 43),  # Second position: start from n_feature_user
-    "third_party": slice(43, 49),  # Third position: start from n_feature_user + n_feature_bank
+    "user": slice(0, 36),  # First position: start from 0
+    "bank": slice(36, 37),  # Second position: start from n_feature_user
+    "third_party": slice(37, 39),  # Third position: start from n_feature_user + n_feature_bank
 }
 
 USER_COLUMNS = [
-    'Own_car', 'Own_property', 'Work_phone', 'Phone', 'Email', 'Num_children', 'Household_size',
+    'Own_car', 'Own_property', 'Mobile_phone', 'Num_children', 'Household_size',
     'Total_income', 'Age', 'Income_type', 'Education_type', 'Family_status', 'Housing_type',
     'Occupation_type',
 ]
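
Note: the new feature layout can be sanity-checked with a few lines of Python. This is an illustrative sketch, not code from this commit; the values are copied from the new settings.py above.

```python
# Illustrative check of the new per-party feature layout (values from settings.py above).
PROCESSED_INPUT_SHAPE = (1, 39)
INPUT_SLICES = {
    "user": slice(0, 36),          # 36 pre-processed user features
    "bank": slice(36, 37),         # 1 bank feature (Account_age)
    "third_party": slice(37, 39),  # 2 third-party features
}

# The three slices tile the processed input: 36 + 1 + 2 == 39 columns.
n_features = sum(s.stop - s.start for s in INPUT_SLICES.values())
assert n_features == PROCESSED_INPUT_SHAPE[1]
```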
utils/client_server_interface.py CHANGED
@@ -27,9 +27,15 @@ class MultiInputsFHEModelClient(FHEModelClient):
 
         super().__init__(*args, **kwargs)
 
-    def quantize_encrypt_serialize_multi_inputs(self, x: numpy.ndarray, input_index, initial_input_shape, input_slice) -> bytes:
+    def quantize_encrypt_serialize_multi_inputs(
+        self,
+        x: numpy.ndarray,
+        input_index,
+        processed_input_shape,
+        input_slice
+    ) -> bytes:
 
-        x_padded = numpy.zeros(initial_input_shape)
+        x_padded = numpy.zeros(processed_input_shape)
 
         x_padded[:, input_slice] = x
 
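Note: to make the shape handling above concrete, here is a minimal sketch (not code from this repository) of how one party's pre-processed features are zero-padded into the full processed input before quantization and encryption, mirroring the `x_padded` logic shown in the diff. The example bank value is hypothetical.

```python
import numpy

# Values mirror the new settings.py: the full processed input has 39 columns,
# and the bank party owns the single column at index 36.
processed_input_shape = (1, 39)
bank_slice = slice(36, 37)

# Hypothetical pre-processed bank input (e.g. a standard-scaled Account_age).
preprocessed_bank_inputs = numpy.array([[0.42]])

# Pad the party's features into their reserved slice; all other columns stay at zero.
x_padded = numpy.zeros(processed_input_shape)
x_padded[:, bank_slice] = preprocessed_bank_inputs

assert x_padded.shape == processed_input_shape
assert x_padded[0, 36] == 0.42
```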
utils/pre_processing.py CHANGED
@@ -1,16 +1,9 @@
-"""Data pre-processing functions.
-
-The pre-processing steps are heavily inspired by the following notebook :
-https://www.kaggle.com/code/rikdifos/credit-card-approval-prediction-using-ml
-
-Additional steps, mostly including renaming some values or features, were added for better user
-experience.
-"""
+"""Data pre-processing functions."""
 
 import numpy
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer
+from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
 
 
 def _get_pipeline_replace_one_hot(func, value):
@@ -23,10 +16,6 @@ def _get_pipeline_replace_one_hot(func, value):
         ("one_hot", OneHotEncoder(),),
     ])
 
-
-def _replace_values_geq(column, value):
-    return numpy.where(column >= value, f"{value}_or_more", column)
-
 def _replace_values_eq(column, value):
     for desired_value, values_to_replace in value.items():
         column = numpy.where(numpy.isin(column, values_to_replace), desired_value, column)
@@ -35,41 +24,32 @@ def _replace_values_eq(column, value):
 def get_pre_processors():
     pre_processor_user = ColumnTransformer(
         transformers=[
-            (
-                "replace_num_children",
-                _get_pipeline_replace_one_hot(_replace_values_geq, 2),
-                ['Num_children']
-            ),
-            (
-                "replace_household_size",
-                _get_pipeline_replace_one_hot(_replace_values_geq, 3),
-                ['Household_size']
-            ),
-            (
-                "replace_income_type",
-                _get_pipeline_replace_one_hot(_replace_values_eq, {"Public Sector": ["Retired", "Student"]}),
-                ['Income_type']
-            ),
-            (
-                "replace_education_type",
-                _get_pipeline_replace_one_hot(_replace_values_eq, {"Higher education": ["Academic degree"]}),
-                ['Education_type']
-            ),
             (
                 "replace_occupation_type_labor",
                 _get_pipeline_replace_one_hot(
                     _replace_values_eq,
                     {
-                        "Labor_work": ["Cleaning staff", "Cooking staff", "Drivers", "Laborers", "Low-wage laborers", "Security staff", "Waiters/barmen staff"],
-                        "Office_work": ["Accountants", "Core staff", "HR staff", "Medicine staff", "Private service staff", "Realty agents", "Sales staff", "Secretaries"],
-                        "High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
+                        "Labor_work": [
+                            "Cooking Staff", "Carpenter", "Plumber", "Factory Worker", "Bus Driver"
+                        ],
+                        "Office_work": [
+                            "Business Owners", "Office Worker", "Accountant", "Entrepreneur", "Salesperson"
+                        ],
+                        "High_tech_work": ["Engineer", "Manager", "Consultant", "Software Developer"],
                     },
                 ),
                 ['Occupation_type']
             ),
-            ('one_hot_housing_fam_status', OneHotEncoder(), ['Housing_type', 'Family_status']),
-            ('qbin_total_income', KBinsDiscretizer(n_bins=3, strategy='quantile', encode="onehot"), ['Total_income']),
-            ('bin_age', KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"), ['Age']),
+            ('one_hot_others', OneHotEncoder(), ['Housing_type', 'Family_status', 'Education_type', 'Income_type']),
+            ('standard_scaler', StandardScaler(), ['Num_children', 'Household_size', 'Total_income', 'Age']),
+        ],
+        remainder='passthrough',
+        verbose_feature_names_out=False,
+    )
+
+    pre_processor_bank = ColumnTransformer(
+        transformers=[
+            ('standard_scaler', StandardScaler(), ['Account_age']),
         ],
         remainder='passthrough',
         verbose_feature_names_out=False,
@@ -77,10 +57,10 @@ def get_pre_processors():
 
     pre_processor_third_party = ColumnTransformer(
        transformers=[
-            ('bin_years_employed', KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"), ['Years_employed'])
+            ('standard_scaler', StandardScaler(), ['Years_employed']),
         ],
         remainder='passthrough',
         verbose_feature_names_out=False,
     )
 
-    return pre_processor_user, pre_processor_third_party
+    return pre_processor_user, pre_processor_bank, pre_processor_third_party
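
Note: the occupation re-mapping above can be illustrated with the following sketch. It assumes `_replace_values_eq` returns the modified column (the return statement falls outside the diff context); the mapping is copied from the new `pre_processor_user` and is applied before one-hot encoding. Labels that match no group pass through unchanged.

```python
import numpy

def _replace_values_eq(column, value):
    # For each target group, replace any matching raw label with the group name.
    for desired_value, values_to_replace in value.items():
        column = numpy.where(numpy.isin(column, values_to_replace), desired_value, column)
    return column

occupations = numpy.array(["Plumber", "Engineer", "Salesperson", "Teacher"])
mapping = {
    "Labor_work": ["Cooking Staff", "Carpenter", "Plumber", "Factory Worker", "Bus Driver"],
    "Office_work": ["Business Owners", "Office Worker", "Accountant", "Entrepreneur", "Salesperson"],
    "High_tech_work": ["Engineer", "Manager", "Consultant", "Software Developer"],
}

print(_replace_values_eq(occupations, mapping))
# ['Labor_work' 'High_tech_work' 'Office_work' 'Teacher']  -> unmapped labels pass through
```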