File size: 5,163 Bytes
4d75828
 
252c32e
 
 
4d75828
 
 
252c32e
4d75828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0cf28e
 
 
4d75828
 
 
c825b1b
4d75828
 
 
 
 
 
 
 
 
 
 
 
 
b85dff4
4d75828
 
 
 
 
2351f00
 
 
 
4d75828
 
 
 
 
 
 
 
 
 
 
 
 
8d90fe7
 
 
 
 
9eab2cb
4d75828
 
 
 
 
 
 
e7ab20c
 
 
4d75828
 
 
e7ab20c
 
 
 
 
4d75828
e7ab20c
0cdc6ad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
import urllib
import urllib.request

import tensorflow_decision_forests as tfdf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import gradio as gr

# URL prefix shared by the dataset files; ".names", ".data.gz" and ".test.gz"
# are appended to it below.
input_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income"
# Name given to the last CSV column, which is used as the label.
input_column_header = "income_level"

# Load data

BASE_PATH = input_path

# Derive the column names from the ".names" file: skip lines starting with
# "|", keep the text before the first ":" and turn spaces into underscores.
# The first two remaining lines are dropped (presumably they are not column
# descriptions -- verify against the file). The response is read inside a
# ``with`` block so the HTTP connection is closed once the header is built.
with urllib.request.urlopen(f"{BASE_PATH}.names") as names_file:
    CSV_HEADER = [
        line.decode("utf-8").split(":")[0].replace(" ", "_")
        for line in names_file
        if not line.startswith(b"|")
    ][2:]

CSV_HEADER.append(input_column_header)

# Neither file has a header row, so the derived names are supplied explicitly.
train_data = pd.read_csv(f"{BASE_PATH}.data.gz", header=None, names=CSV_HEADER)
test_data = pd.read_csv(f"{BASE_PATH}.test.gz", header=None, names=CSV_HEADER)

# Subset data: keep only the four model features plus the label column.
SELECTED_COLUMNS = ["education", "sex", "capital_gains", "capital_losses", input_column_header]
train_data = train_data.loc[:, SELECTED_COLUMNS]
test_data = test_data.loc[:, SELECTED_COLUMNS]

def encode_df(df):
    """Return a copy of ``df`` with known category labels swapped for integers.

    The "sex", "education" and "income_level" columns are handled, one
    ``replace`` call per column. Labels are matched exactly, including the
    leading space they carry. Values that are not listed are left unchanged.
    The input frame itself is not modified.
    """
    # Education labels listed in code order: the first entry becomes 1, the
    # last becomes 17.
    education_levels = (
        " High school graduate",
        " Some college but no degree",
        " 10th grade",
        " Children",
        " Bachelors degree(BA AB BS)",
        " Masters degree(MA MS MEng MEd MSW MBA)",
        " Less than 1st grade",
        " Associates degree-academic program",
        " 7th and 8th grade",
        " 12th grade no diploma",
        " Associates degree-occup /vocational",
        " Prof school degree (MD DDS DVM LLB JD)",
        " 5th or 6th grade",
        " 11th grade",
        " Doctorate degree(PhD EdD)",
        " 9th grade",
        " 1st 2nd 3rd or 4th grade",
    )
    codes_by_column = {
        "sex": {" Male": 0, " Female": 1},
        "education": {
            level: code for code, level in enumerate(education_levels, start=1)
        },
        "income_level": {' - 50000.': 0, ' 50000+.': 1},
    }

    encoded = df
    for column, codes in codes_by_column.items():
        encoded = encoded.replace({column: codes})
    return encoded

# Swap the raw category strings of both splits for their integer codes.
train_data, test_data = encode_df(train_data), encode_df(test_data)

# One FeatureUsage per model input, each declared CATEGORICAL.
# NOTE(review): capital_gains / capital_losses are declared CATEGORICAL as
# well -- confirm this is intended rather than NUMERICAL.
feature_a, feature_b, feature_c, feature_d = (
    tfdf.keras.FeatureUsage(
        name=column_name,
        semantic=tfdf.keras.FeatureSemantic.CATEGORICAL,
    )
    for column_name in ("education", "sex", "capital_gains", "capital_losses")
)

# Wrap both pandas frames as TensorFlow datasets labelled by "income_level".
train_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label="income_level")
    for frame in (train_data, test_data)
)

# Only let TensorFlow log errors from here on.
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Gradient boosted trees classifier restricted to the four declared features.
model = tfdf.keras.GradientBoostedTreesModel(
    task=tfdf.keras.Task.CLASSIFICATION,
    loss="DEFAULT",
    # Inputs: use exactly the listed features and nothing else.
    features=[feature_a, feature_b, feature_c, feature_d],
    exclude_non_specified_features=True,
    # Tree shape and count.
    growing_strategy="BEST_FIRST_GLOBAL",
    num_trees=350,
    max_depth=7,
    min_examples=6,
    # Example sampling and validation split.
    sampling_method="GOSS",
    subsample=0.65,
    validation_ratio=0.1,
    verbose=0,
)

model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
model.fit(train_ds)
# Evaluation on the test split is currently disabled:
# model.evaluate(test_ds)

#prepare user input for the model
def process_inputs(education, sex, capital_gains, capital_losses):
    """Predict the income bracket for a single person.

    Args:
        education: Education level. A label string known to ``encode_df`` is
            converted to its integer code; any other value is passed to the
            model unchanged, so it must already be a training code.
        sex: Sex, following the same rule (label string or encoded value).
        capital_gains: Capital gains value for the person.
        capital_losses: Capital losses value for the person.

    Returns:
        A dict mapping the two class labels, "> $50,000" and "<= $50,000",
        to confidences that sum to 1. The label with the larger value is the
        predicted class.
    """
    # Single-row frame using the same column names the model was trained on.
    row = pd.DataFrame.from_dict(
        {
            "education": [education],
            "sex": [sex],
            "capital_gains": [capital_gains],
            "capital_losses": [capital_losses],
        }
    )
    row = encode_df(row)

    # No label column here: this dataset is only used for inference.
    dataset = tfdf.keras.pd_dataframe_to_tf_dataset(row)

    # One input row gives a single-element prediction; pull it out as a plain
    # float instead of relying on the truth value of an array. The value is
    # treated as the confidence of the "> $50,000" class, matching the 0.5
    # threshold this function applied previously.
    high_income = float(model.predict(dataset).item())

    # Report both classes so the caller sees a real confidence for each,
    # rather than a hard 1 for one label or a 0 for the other.
    return {"> $50,000": high_income, "<= $50,000": 1.0 - high_income}

# Web UI: four inputs feed process_inputs and the result is shown as a label.
iface = gr.Interface(
    process_inputs,
    [
     # type="value" passes the selected label string to process_inputs, where
     # encode_df turns it into the integer code used for training. With
     # type="index" the callback would receive the position in this short
     # list, which does not match those codes.
     gr.inputs.Dropdown([" 1st 2nd 3rd or 4th grade", " High school graduate", 
                         " Bachelors degree(BA AB BS)", " Masters degree(MA MS MEng MEd MSW MBA)", 
                         " Prof school degree (MD DDS DVM LLB JD)",
                         " Doctorate degree(PhD EdD)"], type="value", label="education"), 
     # The index of the choice (0 for " Male", 1 for " Female") equals the
     # code encode_df assigns to it, so type="index" is kept here.
     gr.inputs.Radio([" Male", " Female"], label="Sex (M=0, F=1)", type="index"),
     gr.inputs.Slider(minimum = 0, maximum = 99999, label="capital gains"),
     gr.inputs.Slider(minimum = 0, maximum = 4608, label="capital losses")
    ],
    gr.outputs.Label(num_top_classes=2),
    live=True,
    analytics_enabled=False,
    # Example rows use the displayed choice strings, so each one is a valid
    # selection for its component.
    examples=[
        [" Masters degree(MA MS MEng MEd MSW MBA)", " Male", 7298, 0],
        [" Bachelors degree(BA AB BS)", " Male", 6514, 0],
    ],
)

iface.launch(debug=True)