tdubon committed on
Commit
17931a1
1 Parent(s): ad71777

Upload tfdecisiontrees_final.py

Browse files
Files changed (1) hide show
  1. tfdecisiontrees_final.py +274 -0
tfdecisiontrees_final.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """TFDecisionTrees_Final.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1QCdVlNQ8LszC_v3ek10DUeO9V0IvVzpm
8
+
9
+ # Classification with TF Decision Trees
10
+ Source code from https://keras.io/examples/structured_data/classification_with_tfdf/
11
+ """
12
+
13
+ !pip install huggingface_hub
14
+
15
+ !pip install numpy==1.20
16
+
17
+ !pip install folium==0.2.1
18
+
19
+ !pip install imgaug==0.2.6
20
+
21
+ !pip install tensorflow==2.8.0
22
+
23
+ !pip install -U tensorflow_decision_forests
24
+
25
+ !pip install ipykernel==4.10
26
+
27
+ !apt-get install -y git-lfs
28
+
29
+ !pip install wurlitzer
30
+
31
+ from huggingface_hub import notebook_login
32
+ from huggingface_hub.keras_mixin import push_to_hub_keras
33
+
34
+ notebook_login()
35
+
36
+ import math
37
+ import urllib
38
+ import numpy as np
39
+ import pandas as pd
40
+ import tensorflow as tf
41
+ from tensorflow import keras
42
+ from tensorflow.keras import layers
43
+ import tensorflow_decision_forests as tfdf
44
+ import os
45
+ import tempfile
46
+
47
+ tmpdir = tempfile.mkdtemp()
48
+
49
# `sys_pipes` is taken from wurlitzer when it is installed; otherwise the
# Colab log-capture helper is bound to the same name.
try:
    from wurlitzer import sys_pipes
except ImportError:
    # Only a missing package should trigger the fallback; a bare `except:`
    # would also swallow KeyboardInterrupt and unrelated errors.
    from colabtools.googlelog import CaptureLog as sys_pipes
53
+
54
# Load data
# `import urllib` alone does not guarantee that the `urllib.request`
# submodule is loaded, so import it explicitly before using it.
import urllib.request

input_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income"
input_column_header = "income_level"

BASE_PATH = input_path

# Build the column names from the `.names` file: lines starting with "|" are
# ignored, the text before the first ":" is kept with spaces replaced by
# underscores, and the first two resulting entries are dropped.
# The response is used as a context manager so the connection is closed.
with urllib.request.urlopen(f"{BASE_PATH}.names") as names_file:
    CSV_HEADER = [
        line.decode("utf-8").split(":")[0].replace(" ", "_")
        for line in names_file
        if not line.startswith(b"|")
    ][2:]

# The label column name is appended as the final column name.
CSV_HEADER.append(input_column_header)

# Neither file is read with a header row; CSV_HEADER supplies the names.
train_data = pd.read_csv(f"{BASE_PATH}.data.gz", header=None, names=CSV_HEADER)
test_data = pd.read_csv(f"{BASE_PATH}.test.gz", header=None, names=CSV_HEADER)
68
+
69
# Replace the " ?" placeholder in the MSA migration column with "Unansw",
# in both the training and the test split, then show the resulting values.
for _frame in (train_data, test_data):
    _frame["migration_code-change_in_msa"] = _frame["migration_code-change_in_msa"].map(
        lambda answer: answer if answer != " ?" else "Unansw"
    )

print(train_data["migration_code-change_in_msa"].unique())
74
+
75
# Replace header names that contain an apostrophe or a hyphen with
# plain-identifier equivalents. The list is updated in place so every
# existing reference to CSV_HEADER sees the new names.
_HEADER_RENAMES = {
    "fill_inc_questionnaire_for_veteran's_admin": "fill_inc_veterans_admin",
    "migration_code-change_in_msa": "migration_code_chx_in_msa",
    "migration_code-change_in_reg": "migration_code_chx_in_reg",
    "migration_code-move_within_reg": "migration_code_move_within_reg",
}
CSV_HEADER[:] = [_HEADER_RENAMES.get(name, name) for name in CSV_HEADER]
84
+
85
# Inspect the distinct values of the label column (`income_level`).
label_values = train_data["income_level"].unique()
classes = label_values.tolist()
print(f"Label classes: {classes}")
88
+
89
# Rename columns containing invalid characters (apostrophe / hyphen).
_COLUMN_RENAMES = {
    "fill_inc_questionnaire_for_veteran's_admin": "fill_inc_veterans_admin",
    "migration_code-change_in_msa": "migration_code_chx_in_msa",
    "migration_code-change_in_reg": "migration_code_chx_in_reg",
    "migration_code-move_within_reg": "migration_code_move_within_reg",
}
train_data = train_data.rename(columns=_COLUMN_RENAMES)
# Bug fix: this previously renamed `train_data` again and assigned the result
# to `test_data`, replacing the test split with a copy of the training split
# (so the model was evaluated on the data it was trained on).
test_data = test_data.rename(columns=_COLUMN_RENAMES)
92
+
93
# Convert the label from string to integer.
# The label column holds strings here, while Keras expects integer class
# labels, so each label is replaced by its position in `target_labels`.
target_labels = [" - 50000.", " 50000+."]
for _split in (train_data, test_data):
    _split[input_column_header] = _split[input_column_header].map(target_labels.index)
99
+
100
# Report the dimensions of both splits, then show the first training rows
# transposed (one column per row of the frame).
print(
    f"Train data shape: {train_data.shape}",
    f"Test data shape: {test_data.shape}",
    sep="\n",
)
print(train_data.head().transpose())
104
+
105
# Define metadata

# Name of the label column.
TARGET_COLUMN_NAME = "income_level"
# Name of the per-example weight column.
WEIGHT_COLUMN_NAME = "instance_weight"
# Names of the features treated as numeric.
NUMERIC_FEATURE_NAMES = [
    "age",
    "wage_per_hour",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "weeks_worked_in_year",
]

# Every remaining header column (not numeric, not the weight, not the label)
# is categorical. Its vocabulary is the sorted list of the distinct training
# values, each converted to a string.
_NON_CATEGORICAL_COLUMNS = {
    *NUMERIC_FEATURE_NAMES,
    WEIGHT_COLUMN_NAME,
    TARGET_COLUMN_NAME,
}
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    column: sorted(map(str, train_data[column].unique()))
    for column in CSV_HEADER
    if column not in _NON_CATEGORICAL_COLUMNS
}

# All feature names: numeric first, then categorical.
FEATURE_NAMES = [*NUMERIC_FEATURE_NAMES, *CATEGORICAL_FEATURES_WITH_VOCABULARY]
135
+
136
+ """Configure hyperparameters for the tree model."""
137
+
138
# Values handed to the gradient boosted trees constructor further down.
GROWING_STRATEGY = "BEST_FIRST_GLOBAL"  # passed as `growing_strategy`
NUM_TREES = 250                         # passed as `num_trees`
MIN_EXAMPLES = 6                        # passed as `min_examples`
MAX_DEPTH = 5                           # passed as `max_depth`
SUBSAMPLE = 0.65                        # passed as `subsample`
# NOTE(review): SAMPLING_METHOD does not appear to be passed to the model in
# the visible code; confirm whether it is still needed.
SAMPLING_METHOD = "RANDOM"
VALIDATION_RATIO = 0.1                  # passed as `validation_ratio`
145
+
146
+ #Implement training & evaluation procedure
147
def prepare_sample(features, target, weight):
    """Cast non-string categorical features to strings.

    Args:
        features: Mapping from feature name to tensor. It is updated in place.
        target: Label value, returned unchanged.
        weight: Example weight, returned unchanged.

    Returns:
        The ``(features, target, weight)`` triple. Every feature listed in
        ``CATEGORICAL_FEATURES_WITH_VOCABULARY`` whose dtype is not string
        has been converted with ``tf.strings.as_string``.
    """
    needs_cast = [
        name
        for name in features
        if name in CATEGORICAL_FEATURES_WITH_VOCABULARY
        and features[name].dtype != tf.dtypes.string
    ]
    for name in needs_cast:
        features[name] = tf.strings.as_string(features[name])
    return features, target, weight
154
+
155
+
156
def run_experiment(model, train_data, test_data, num_epochs=1, batch_size=None):
    """Fit `model` on the training frame and print its accuracy on the test frame.

    Both data frames are converted to TensorFlow datasets with
    "income_level" as the label and "instance_weight" as the example weight,
    and every example is passed through `prepare_sample`.

    Args:
        model: Compiled model exposing `fit` and `evaluate`; `evaluate` must
            return two values, the second of which is the accuracy.
        train_data: pandas DataFrame used for fitting.
        test_data: pandas DataFrame used for evaluation.
        num_epochs: Forwarded to `model.fit` as `epochs`.
        batch_size: Forwarded to `model.fit` as `batch_size`.
    """

    def to_dataset(frame):
        # Same conversion for both splits, so it lives in one place.
        return tfdf.keras.pd_dataframe_to_tf_dataset(
            frame, label="income_level", weight="instance_weight"
        ).map(prepare_sample, num_parallel_calls=tf.data.AUTOTUNE)

    train_dataset = to_dataset(train_data)
    test_dataset = to_dataset(test_data)

    model.fit(train_dataset, epochs=num_epochs, batch_size=batch_size)
    _, accuracy = model.evaluate(test_dataset, verbose=0)
    # The unused local `push_to_hub = True` that used to sit here was dead
    # code and has been removed.
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
169
+
170
+ #Create model inputs
171
+
172
def create_model_inputs():
    """Build one scalar Keras `Input` per entry of `FEATURE_NAMES`.

    Returns:
        A dict keyed by feature name. Features listed in
        `NUMERIC_FEATURE_NAMES` get a `tf.float32` input; all others get a
        `tf.string` input.
    """
    return {
        name: layers.Input(
            name=name,
            shape=(),
            dtype=tf.float32 if name in NUMERIC_FEATURE_NAMES else tf.string,
        )
        for name in FEATURE_NAMES
    }
184
+
185
+ """# Experiment 1: Decision Forests with raw features"""
186
+
187
+ #Decision Forest with raw features
188
def specify_feature_usages(inputs):
    """Describe each model input as a TF-DF `FeatureUsage`.

    Args:
        inputs: Dict mapping feature name to an object with a `dtype`
            attribute (the Keras inputs built by `create_model_inputs`).

    Returns:
        A list with one `FeatureUsage` per input, in the dict's iteration
        order. Inputs whose dtype is float32 are marked NUMERICAL; every
        other input is marked CATEGORICAL.
    """
    semantics = tfdf.keras.FeatureSemantic
    return [
        tfdf.keras.FeatureUsage(
            name=name,
            semantic=(
                semantics.NUMERICAL
                if model_input.dtype == tf.dtypes.float32
                else semantics.CATEGORICAL
            ),
        )
        for name, model_input in inputs.items()
    ]
203
+
204
+ #Create GB trees model
205
def create_gbt_model():
    """Create and compile the gradient boosted trees classifier.

    The feature list comes from `specify_feature_usages(create_model_inputs())`
    and the tree hyperparameters from the module-level constants.

    Returns:
        A `tfdf.keras.GradientBoostedTreesModel` compiled with a
        `BinaryAccuracy` metric named "accuracy".
    """
    feature_usages = specify_feature_usages(create_model_inputs())
    tree_hyperparameters = {
        "growing_strategy": GROWING_STRATEGY,
        "num_trees": NUM_TREES,
        "max_depth": MAX_DEPTH,
        "min_examples": MIN_EXAMPLES,
        "subsample": SUBSAMPLE,
        "validation_ratio": VALIDATION_RATIO,
    }

    model = tfdf.keras.GradientBoostedTreesModel(
        features=feature_usages,
        exclude_non_specified_features=True,
        task=tfdf.keras.Task.CLASSIFICATION,
        loss="DEFAULT",
        **tree_hyperparameters,
    )
    model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
    return model
221
+
222
# Train and evaluate model
gbt_model = create_gbt_model()
run_experiment(gbt_model, train_data, test_data)

# Inspect the model: model type, mask, input features, feature importance.
# `summary()` prints the report itself and returns None, so wrapping it in
# `print()` only added a stray "None" line to the output.
gbt_model.summary()

inspector = gbt_model.make_inspector()
# Bare expressions below are notebook cell outputs: they display a value in
# Colab and have no effect when run as a plain script.
[field for field in dir(inspector) if not field.startswith("_")]

# Plot the model
tfdf.model_plotter.plot_model_in_colab(gbt_model, tree_idx=0, max_depth=3)

# Display variable importance
inspector.variable_importances()

print("Model type:", inspector.model_type())
print("Number of trees:", inspector.num_trees())
print("Objective:", inspector.objective())
print("Input features:", inspector.features())

inspector.features()

#save_path = os.path.join(tmpdir, "raw/1/")
# NOTE(review): hard-coded, user-specific output directory; `tmpdir` is
# created near the top of the script but not used here.
gbt_model.save("/Users/tdubon/TF_Model")
247
+
248
+ """# Creating HF Space"""
249
+
250
+ from huggingface_hub import KerasModelHubMixin
251
+ from huggingface_hub.keras_mixin import push_to_hub_keras
252
+ push_to_hub_keras(gbt_model, repo_url="https://huggingface.co/keras-io/TF_Decision_Trees")
253
+
254
+ #Clone and configure
255
+ !git clone https://tdubon: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@huggingface.co/tdubon/TF_Decision_Trees
256
+
257
+ !cd TFClassificationForest
258
+ !git config --global user.email "tdubon6@gmail.com"
259
+ # Tip: using the same email than for your huggingface.co account will link your commits to your profile
260
+ !git config --global user.name "tdubon"
261
+
262
+ !git add .
263
+ !git commit -m "Initial commit"
264
+ !git push
265
+
266
# Write the trained model to disk through the Keras saving API.
tf.keras.models.save_model(
    gbt_model,
    "/Users/tdubon/TFClassificationForest",
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None,
    save_traces=True,
)

# Export the model structure for inspection in TensorBoard.
gbt_model.make_inspector().export_to_tensorboard("/tmp/tb_logs/model_1")

# Commented out IPython magic to ensure Python compatibility.
# %load_ext tensorboard
# %tensorboard --logdir "/tmp/tb_logs"