Spaces:

zama-fhe
/

encrypted_credit_scoring

Running

App Files Files Community

romanbredehoft-zama committed on Nov 28, 2023

Commit

993f2a6

1 Parent(s): 747c295

Replace outdated terminologies and remove some features

Browse files

Files changed (11) hide show

app.py +1 -2
backend.py +13 -19
data/clean_data.csv +0 -0
data/data.csv +0 -0
deployment_files/client.zip +2 -2
deployment_files/pre_processor_third_party.pkl +2 -2
deployment_files/pre_processor_user.pkl +2 -2
deployment_files/server.zip +2 -2
development.py +2 -2
settings.py +4 -5
utils/pre_processing.py +4 -4

app.py CHANGED Viewed

@@ -77,7 +77,6 @@ with demo:
     with gr.Row():
         with gr.Column():
             gr.Markdown("### User")
-            gender = gr.Radio(["Female", "Male"], label="Gender", value="Female")
             bool_inputs = gr.CheckboxGroup(["Car", "Property", "Work phone", "Phone", "Email"], label="What do you own ?")
             num_children = gr.Slider(**CHILDREN_MIN_MAX, step=1, label="Number of children", info="How many children do you have ?")
             household_size = gr.Slider(**FAMILY_MIN_MAX, step=1, label="Household size", info="How many members does your household have? ?")
@@ -185,7 +184,7 @@ with demo:
     # side to the server
     encrypt_button_user.click(
         pre_process_encrypt_send_user,
-        inputs=[client_id, gender, bool_inputs, num_children, household_size, total_income, age, \
                 income_type, education_type, family_status, occupation_type, housing_type],
         outputs=[encrypted_input_user],
     )

     with gr.Row():
         with gr.Column():
             gr.Markdown("### User")
             bool_inputs = gr.CheckboxGroup(["Car", "Property", "Work phone", "Phone", "Email"], label="What do you own ?")
             num_children = gr.Slider(**CHILDREN_MIN_MAX, step=1, label="Number of children", info="How many children do you have ?")
             household_size = gr.Slider(**FAMILY_MIN_MAX, step=1, label="Household size", info="How many members does your household have? ?")
     # side to the server
     encrypt_button_user.click(
         pre_process_encrypt_send_user,
+        inputs=[client_id, bool_inputs, num_children, household_size, total_income, age, \
                 income_type, education_type, family_status, occupation_type, housing_type],
         outputs=[encrypted_input_user],
     )

backend.py CHANGED Viewed

@@ -243,12 +243,8 @@ def pre_process_encrypt_send_user(client_id, *inputs):
         (int, bytes): Integer ID representing the current client and a byte short representation of
             the encrypted input to send.
     """
-    gender, bool_inputs, num_children, household_size, total_income, age, income_type, education_type, \
         family_status, occupation_type, housing_type = inputs
-    # Encoding given in https://www.kaggle.com/code/samuelcortinhas/credit-cards-data-cleaning
-    # for "Gender" is M ('Male') -> 1 and F ('Female') -> 0
-    gender = gender == "Male"
     # Retrieve boolean values
     own_car = "Car" in bool_inputs
@@ -258,21 +254,20 @@ def pre_process_encrypt_send_user(client_id, *inputs):
     email = "Email" in bool_inputs
     user_inputs = pandas.DataFrame({
-        "Gender": [gender],
         "Own_car": [own_car],
         "Own_property": [own_property],
         "Work_phone": [work_phone],
         "Phone": [phone],
         "Email": [email],
-        "Num_children": num_children,
-        "Num_family": household_size,
-        "Total_income": total_income,
-        "Age": age,
-        "Income_type": income_type,
-        "Education_type": education_type,
-        "Family_status": family_status,
-        "Occupation_type": occupation_type,
-        "Housing_type": housing_type,
     })
     preprocessed_user_inputs = PRE_PROCESSOR_USER.transform(user_inputs)
@@ -308,12 +303,11 @@ def pre_process_encrypt_send_third_party(client_id, *inputs):
             the encrypted input to send.
     """
     salaried, years_salaried = inputs
-    # Original dataset contains an "unemployed" feature instead of "employed"
-    unemployed = salaried == "No"
     third_party_inputs = pandas.DataFrame({
-        "Unemployed": [unemployed],
         "Years_employed": [years_salaried],
     })

         (int, bytes): Integer ID representing the current client and a byte short representation of
             the encrypted input to send.
     """
+    bool_inputs, num_children, household_size, total_income, age, income_type, education_type, \
         family_status, occupation_type, housing_type = inputs
     # Retrieve boolean values
     own_car = "Car" in bool_inputs
     email = "Email" in bool_inputs
     user_inputs = pandas.DataFrame({
         "Own_car": [own_car],
         "Own_property": [own_property],
         "Work_phone": [work_phone],
         "Phone": [phone],
         "Email": [email],
+        "Num_children": [num_children],
+        "Household_size": [household_size],
+        "Total_income": [total_income],
+        "Age": [age],
+        "Income_type": [income_type],
+        "Education_type": [education_type],
+        "Family_status": [family_status],
+        "Occupation_type": [occupation_type],
+        "Housing_type": [housing_type],
     })
     preprocessed_user_inputs = PRE_PROCESSOR_USER.transform(user_inputs)
             the encrypted input to send.
     """
     salaried, years_salaried = inputs
+    is_salaried = salaried == "Yes"
     third_party_inputs = pandas.DataFrame({
+        "Salaried": [is_salaried],
         "Years_employed": [years_salaried],
     })

data/clean_data.csv DELETED Viewed

The diff for this file is too large to render. See raw diff

data/data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

deployment_files/client.zip CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:86307f54a5d224ee89cfe62ba541e752fcd6ef33d6a427b2d1fc45c76a44818f
-size 76534

 version https://git-lfs.github.com/spec/v1
+oid sha256:a4c826efceb2e6c4d9fd1d3876d7adae10537814add6ae3f08b5dab9ae23f76b
+size 76339

deployment_files/pre_processor_third_party.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ee39c00c8ca119a4e61f6905687c9bb540352b5ce4005aaba125290679722587
-size 1590

 version https://git-lfs.github.com/spec/v1
+oid sha256:999ae2b4f9420d6b2e2987035cde0f43e93b2edb85ac32b3c877584b45871ca8
+size 1588

deployment_files/pre_processor_user.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:af3db3f40e0e38febb8efb858e07df1f432458cc66f2edb38bedbd4d35520802
-size 6207

 version https://git-lfs.github.com/spec/v1
+oid sha256:8454f6f929f89b5dec427d9f6e522a33cde0a49c8dc6f06a650bb0bf90b59913
+size 6221

deployment_files/server.zip CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6f626ffe46861078806c03cd633cb13353d36c294e3cd411f59252fc88dd2ea9
-size 3326

 version https://git-lfs.github.com/spec/v1
+oid sha256:adc3d696a290278148d2ac906018a3a58d3c545290f6fdb60a82a3f2e7eea531
+size 3322

development.py CHANGED Viewed

@@ -31,7 +31,7 @@ print("Load and pre-process the data")
 # A few additional pre-processing steps have been applied to this data set as well :
 # - "ID" column has been removed
 # - "Total_income" values have been multiplied by 0.14 to make its median match France's annual
-#    salary one from 2023 (in euros)
 data = pandas.read_csv(DATA_PATH, encoding="utf-8")
 # Define input and target data
@@ -39,7 +39,7 @@ data_y = data.pop("Target").copy()
 data_x = data.copy()
 # Get data from all parties
-data_third_party = select_and_pop_features(data_x, ["Years_employed", "Unemployed"])
 data_bank = select_and_pop_features(data_x, ["Account_length"])
 data_user = data_x.copy()

 # A few additional pre-processing steps have been applied to this data set as well :
 # - "ID" column has been removed
 # - "Total_income" values have been multiplied by 0.14 to make its median match France's annual
+#    salary one from 2023 (22050 euros)
 data = pandas.read_csv(DATA_PATH, encoding="utf-8")
 # Define input and target data
 data_x = data.copy()
 # Get data from all parties
+data_third_party = select_and_pop_features(data_x, ["Years_employed", "Salaried"])
 data_bank = select_and_pop_features(data_x, ["Account_length"])
 data_user = data_x.copy()

settings.py CHANGED Viewed

@@ -25,10 +25,9 @@ SERVER_FILES.mkdir(exist_ok=True)
 SERVER_URL = "http://localhost:8000/"
 # Path to data file
-# The data was previously cleaned using this notebook : https://www.kaggle.com/code/samuelcortinhas/credit-cards-data-cleaning
-# Additionally, the "ID" columns has been removed and the "Total_income" has been adjusted so that
-# its median value corresponds to France's 2023 median annual salary (22050 euros)
-DATA_PATH = "data/clean_data.csv"
 # Development settings
 RANDOM_STATE = 0
@@ -61,7 +60,7 @@ CHILDREN_MIN_MAX = get_min_max(_data, "Num_children")
 INCOME_MIN_MAX = get_min_max(_data, "Total_income")
 AGE_MIN_MAX = get_min_max(_data, "Age")
 SALARIED_MIN_MAX = get_min_max(_data, "Years_employed")
-FAMILY_MIN_MAX = get_min_max(_data, "Num_family")
 # App data choices
 INCOME_TYPES = list(_data["Income_type"].unique())

 SERVER_URL = "http://localhost:8000/"
 # Path to data file
+# Details about pre-processing steps can be found in the 'development.py' and 'pre_processing.py'
+# files
+DATA_PATH = "data/data.csv"
 # Development settings
 RANDOM_STATE = 0
 INCOME_MIN_MAX = get_min_max(_data, "Total_income")
 AGE_MIN_MAX = get_min_max(_data, "Age")
 SALARIED_MIN_MAX = get_min_max(_data, "Years_employed")
+FAMILY_MIN_MAX = get_min_max(_data, "Household_size")
 # App data choices
 INCOME_TYPES = list(_data["Income_type"].unique())

utils/pre_processing.py CHANGED Viewed

@@ -41,13 +41,13 @@ def get_pre_processors():
                 ['Num_children']
             ),
             (
-                "replace_num_family",
                 _get_pipeline_replace_one_hot(_replace_values_geq, 3),
-                ['Num_family']
             ),
             (
                 "replace_income_type",
-                _get_pipeline_replace_one_hot(_replace_values_eq, {"State servant": ["Pensioner", "Student"]}),
                 ['Income_type']
             ),
             (
@@ -60,7 +60,7 @@ def get_pre_processors():
                 _get_pipeline_replace_one_hot(
                     _replace_values_eq,
                         {
-                            "Labor_work": ["Cleaning staff", "Cooking staff", "Drivers", "Laborers", "Low-skill Laborers", "Security staff", "Waiters/barmen staff"],
                             "Office_work": ["Accountants", "Core staff", "HR staff", "Medicine staff", "Private service staff", "Realty agents", "Sales staff", "Secretaries"],
                             "High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
                         },

                 ['Num_children']
             ),
             (
+                "replace_household_size",
                 _get_pipeline_replace_one_hot(_replace_values_geq, 3),
+                ['Household_size']
             ),
             (
                 "replace_income_type",
+                _get_pipeline_replace_one_hot(_replace_values_eq, {"Public Sector": ["Retired", "Student"]}),
                 ['Income_type']
             ),
             (
                 _get_pipeline_replace_one_hot(
                     _replace_values_eq,
                         {
+                            "Labor_work": ["Cleaning staff", "Cooking staff", "Drivers", "Laborers", "Low-wage laborers", "Security staff", "Waiters/barmen staff"],
                             "Office_work": ["Accountants", "Core staff", "HR staff", "Medicine staff", "Private service staff", "Realty agents", "Sales staff", "Secretaries"],
                             "High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
                         },