romanbredehoft-zama committed on
Commit
993f2a6
1 Parent(s): 747c295

Replace outdated terminologies and remove some features

Browse files
app.py CHANGED
@@ -77,7 +77,6 @@ with demo:
77
  with gr.Row():
78
  with gr.Column():
79
  gr.Markdown("### User")
80
- gender = gr.Radio(["Female", "Male"], label="Gender", value="Female")
81
  bool_inputs = gr.CheckboxGroup(["Car", "Property", "Work phone", "Phone", "Email"], label="What do you own ?")
82
  num_children = gr.Slider(**CHILDREN_MIN_MAX, step=1, label="Number of children", info="How many children do you have ?")
83
  household_size = gr.Slider(**FAMILY_MIN_MAX, step=1, label="Household size", info="How many members does your household have? ?")
@@ -185,7 +184,7 @@ with demo:
185
  # side to the server
186
  encrypt_button_user.click(
187
  pre_process_encrypt_send_user,
188
- inputs=[client_id, gender, bool_inputs, num_children, household_size, total_income, age, \
189
  income_type, education_type, family_status, occupation_type, housing_type],
190
  outputs=[encrypted_input_user],
191
  )
 
77
  with gr.Row():
78
  with gr.Column():
79
  gr.Markdown("### User")
 
80
  bool_inputs = gr.CheckboxGroup(["Car", "Property", "Work phone", "Phone", "Email"], label="What do you own ?")
81
  num_children = gr.Slider(**CHILDREN_MIN_MAX, step=1, label="Number of children", info="How many children do you have ?")
82
  household_size = gr.Slider(**FAMILY_MIN_MAX, step=1, label="Household size", info="How many members does your household have? ?")
 
184
  # side to the server
185
  encrypt_button_user.click(
186
  pre_process_encrypt_send_user,
187
+ inputs=[client_id, bool_inputs, num_children, household_size, total_income, age, \
188
  income_type, education_type, family_status, occupation_type, housing_type],
189
  outputs=[encrypted_input_user],
190
  )
backend.py CHANGED
@@ -243,12 +243,8 @@ def pre_process_encrypt_send_user(client_id, *inputs):
243
  (int, bytes): Integer ID representing the current client and a byte short representation of
244
  the encrypted input to send.
245
  """
246
- gender, bool_inputs, num_children, household_size, total_income, age, income_type, education_type, \
247
  family_status, occupation_type, housing_type = inputs
248
-
249
- # Encoding given in https://www.kaggle.com/code/samuelcortinhas/credit-cards-data-cleaning
250
- # for "Gender" is M ('Male') -> 1 and F ('Female') -> 0
251
- gender = gender == "Male"
252
 
253
  # Retrieve boolean values
254
  own_car = "Car" in bool_inputs
@@ -258,21 +254,20 @@ def pre_process_encrypt_send_user(client_id, *inputs):
258
  email = "Email" in bool_inputs
259
 
260
  user_inputs = pandas.DataFrame({
261
- "Gender": [gender],
262
  "Own_car": [own_car],
263
  "Own_property": [own_property],
264
  "Work_phone": [work_phone],
265
  "Phone": [phone],
266
  "Email": [email],
267
- "Num_children": num_children,
268
- "Num_family": household_size,
269
- "Total_income": total_income,
270
- "Age": age,
271
- "Income_type": income_type,
272
- "Education_type": education_type,
273
- "Family_status": family_status,
274
- "Occupation_type": occupation_type,
275
- "Housing_type": housing_type,
276
  })
277
 
278
  preprocessed_user_inputs = PRE_PROCESSOR_USER.transform(user_inputs)
@@ -308,12 +303,11 @@ def pre_process_encrypt_send_third_party(client_id, *inputs):
308
  the encrypted input to send.
309
  """
310
  salaried, years_salaried = inputs
311
-
312
- # Original dataset contains an "unemployed" feature instead of "employed"
313
- unemployed = salaried == "No"
314
 
315
  third_party_inputs = pandas.DataFrame({
316
- "Unemployed": [unemployed],
317
  "Years_employed": [years_salaried],
318
  })
319
 
 
243
  (int, bytes): Integer ID representing the current client and a byte short representation of
244
  the encrypted input to send.
245
  """
246
+ bool_inputs, num_children, household_size, total_income, age, income_type, education_type, \
247
  family_status, occupation_type, housing_type = inputs
 
 
 
 
248
 
249
  # Retrieve boolean values
250
  own_car = "Car" in bool_inputs
 
254
  email = "Email" in bool_inputs
255
 
256
  user_inputs = pandas.DataFrame({
 
257
  "Own_car": [own_car],
258
  "Own_property": [own_property],
259
  "Work_phone": [work_phone],
260
  "Phone": [phone],
261
  "Email": [email],
262
+ "Num_children": [num_children],
263
+ "Household_size": [household_size],
264
+ "Total_income": [total_income],
265
+ "Age": [age],
266
+ "Income_type": [income_type],
267
+ "Education_type": [education_type],
268
+ "Family_status": [family_status],
269
+ "Occupation_type": [occupation_type],
270
+ "Housing_type": [housing_type],
271
  })
272
 
273
  preprocessed_user_inputs = PRE_PROCESSOR_USER.transform(user_inputs)
 
303
  the encrypted input to send.
304
  """
305
  salaried, years_salaried = inputs
306
+
307
+ is_salaried = salaried == "Yes"
 
308
 
309
  third_party_inputs = pandas.DataFrame({
310
+ "Salaried": [is_salaried],
311
  "Years_employed": [years_salaried],
312
  })
313
 
data/clean_data.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/data.csv ADDED
The diff for this file is too large to render. See raw diff
 
deployment_files/client.zip CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86307f54a5d224ee89cfe62ba541e752fcd6ef33d6a427b2d1fc45c76a44818f
3
- size 76534
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4c826efceb2e6c4d9fd1d3876d7adae10537814add6ae3f08b5dab9ae23f76b
3
+ size 76339
deployment_files/pre_processor_third_party.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee39c00c8ca119a4e61f6905687c9bb540352b5ce4005aaba125290679722587
3
- size 1590
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:999ae2b4f9420d6b2e2987035cde0f43e93b2edb85ac32b3c877584b45871ca8
3
+ size 1588
deployment_files/pre_processor_user.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af3db3f40e0e38febb8efb858e07df1f432458cc66f2edb38bedbd4d35520802
3
- size 6207
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8454f6f929f89b5dec427d9f6e522a33cde0a49c8dc6f06a650bb0bf90b59913
3
+ size 6221
deployment_files/server.zip CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f626ffe46861078806c03cd633cb13353d36c294e3cd411f59252fc88dd2ea9
3
- size 3326
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adc3d696a290278148d2ac906018a3a58d3c545290f6fdb60a82a3f2e7eea531
3
+ size 3322
development.py CHANGED
@@ -31,7 +31,7 @@ print("Load and pre-process the data")
31
  # A few additional pre-processing steps have been applied to this data set as well:
32
  # - "ID" column has been removed
33
  # - "Total_income" values have been multiplied by 0.14 to make its median match France's annual
34
- # salary one from 2023 (in euros)
35
  data = pandas.read_csv(DATA_PATH, encoding="utf-8")
36
 
37
  # Define input and target data
@@ -39,7 +39,7 @@ data_y = data.pop("Target").copy()
39
  data_x = data.copy()
40
 
41
  # Get data from all parties
42
- data_third_party = select_and_pop_features(data_x, ["Years_employed", "Unemployed"])
43
  data_bank = select_and_pop_features(data_x, ["Account_length"])
44
  data_user = data_x.copy()
45
 
 
31
  # A few additional pre-processing steps have been applied to this data set as well:
32
  # - "ID" column has been removed
33
  # - "Total_income" values have been multiplied by 0.14 so that their median matches France's
34
+ # median annual salary in 2023 (22050 euros)
35
  data = pandas.read_csv(DATA_PATH, encoding="utf-8")
36
 
37
  # Define input and target data
 
39
  data_x = data.copy()
40
 
41
  # Get data from all parties
42
+ data_third_party = select_and_pop_features(data_x, ["Years_employed", "Salaried"])
43
  data_bank = select_and_pop_features(data_x, ["Account_length"])
44
  data_user = data_x.copy()
45
 
settings.py CHANGED
@@ -25,10 +25,9 @@ SERVER_FILES.mkdir(exist_ok=True)
25
  SERVER_URL = "http://localhost:8000/"
26
 
27
  # Path to data file
28
- # The data was previously cleaned using this notebook : https://www.kaggle.com/code/samuelcortinhas/credit-cards-data-cleaning
29
- # Additionally, the "ID" column has been removed and the "Total_income" has been adjusted so that
30
- # its median value corresponds to France's 2023 median annual salary (22050 euros)
31
- DATA_PATH = "data/clean_data.csv"
32
 
33
  # Development settings
34
  RANDOM_STATE = 0
@@ -61,7 +60,7 @@ CHILDREN_MIN_MAX = get_min_max(_data, "Num_children")
61
  INCOME_MIN_MAX = get_min_max(_data, "Total_income")
62
  AGE_MIN_MAX = get_min_max(_data, "Age")
63
  SALARIED_MIN_MAX = get_min_max(_data, "Years_employed")
64
- FAMILY_MIN_MAX = get_min_max(_data, "Num_family")
65
 
66
  # App data choices
67
  INCOME_TYPES = list(_data["Income_type"].unique())
 
25
  SERVER_URL = "http://localhost:8000/"
26
 
27
  # Path to data file
28
+ # Details about pre-processing steps can be found in the 'development.py' and 'pre_processing.py'
29
+ # files
30
+ DATA_PATH = "data/data.csv"
 
31
 
32
  # Development settings
33
  RANDOM_STATE = 0
 
60
  INCOME_MIN_MAX = get_min_max(_data, "Total_income")
61
  AGE_MIN_MAX = get_min_max(_data, "Age")
62
  SALARIED_MIN_MAX = get_min_max(_data, "Years_employed")
63
+ FAMILY_MIN_MAX = get_min_max(_data, "Household_size")
64
 
65
  # App data choices
66
  INCOME_TYPES = list(_data["Income_type"].unique())
utils/pre_processing.py CHANGED
@@ -41,13 +41,13 @@ def get_pre_processors():
41
  ['Num_children']
42
  ),
43
  (
44
- "replace_num_family",
45
  _get_pipeline_replace_one_hot(_replace_values_geq, 3),
46
- ['Num_family']
47
  ),
48
  (
49
  "replace_income_type",
50
- _get_pipeline_replace_one_hot(_replace_values_eq, {"State servant": ["Pensioner", "Student"]}),
51
  ['Income_type']
52
  ),
53
  (
@@ -60,7 +60,7 @@ def get_pre_processors():
60
  _get_pipeline_replace_one_hot(
61
  _replace_values_eq,
62
  {
63
- "Labor_work": ["Cleaning staff", "Cooking staff", "Drivers", "Laborers", "Low-skill Laborers", "Security staff", "Waiters/barmen staff"],
64
  "Office_work": ["Accountants", "Core staff", "HR staff", "Medicine staff", "Private service staff", "Realty agents", "Sales staff", "Secretaries"],
65
  "High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
66
  },
 
41
  ['Num_children']
42
  ),
43
  (
44
+ "replace_household_size",
45
  _get_pipeline_replace_one_hot(_replace_values_geq, 3),
46
+ ['Household_size']
47
  ),
48
  (
49
  "replace_income_type",
50
+ _get_pipeline_replace_one_hot(_replace_values_eq, {"Public Sector": ["Retired", "Student"]}),
51
  ['Income_type']
52
  ),
53
  (
 
60
  _get_pipeline_replace_one_hot(
61
  _replace_values_eq,
62
  {
63
+ "Labor_work": ["Cleaning staff", "Cooking staff", "Drivers", "Laborers", "Low-wage laborers", "Security staff", "Waiters/barmen staff"],
64
  "Office_work": ["Accountants", "Core staff", "HR staff", "Medicine staff", "Private service staff", "Realty agents", "Sales staff", "Secretaries"],
65
  "High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
66
  },