# -*- coding: utf-8 -*-
"""TFDecisionTrees_Final.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1QCdVlNQ8LszC_v3ek10DUeO9V0IvVzpm

# Classification with TF Decision Trees

Source code from https://keras.io/examples/structured_data/classification_with_tfdf/
"""

import subprocess
import sys


def _run_setup_command(command):
    """Run *command* (an argument list, no shell) the way a notebook ``!`` line would.

    A non-zero exit status is reported on stderr but does not raise, which
    mirrors the notebook behaviour where a failing ``!`` line never aborts
    the remaining cells.

    Returns:
        The exit status of the command.
    """
    completed = subprocess.run(command, check=False)
    if completed.returncode != 0:
        print(
            f"Setup command {command[0]!r} exited with status {completed.returncode}",
            file=sys.stderr,
        )
    return completed.returncode


# Environment setup (formerly `!pip install ...` / `!apt-get install ...`
# notebook lines, which are not valid Python). Order is preserved.
for pip_args in (
    ["huggingface_hub"],
    ["numpy==1.20"],
    ["folium==0.2.1"],
    ["imgaug==0.2.6"],
    ["tensorflow==2.8.0"],
    ["-U", "tensorflow_decision_forests"],
    ["ipykernel==4.10"],
):
    _run_setup_command([sys.executable, "-m", "pip", "install", *pip_args])
_run_setup_command(["apt-get", "install", "-y", "git-lfs"])
_run_setup_command([sys.executable, "-m", "pip", "install", "wurlitzer"])

# These imports must stay below the install steps: the packages may not exist
# before the commands above have run.
from huggingface_hub import notebook_login
from huggingface_hub.keras_mixin import push_to_hub_keras

notebook_login()

import math
import os
import tempfile
import urllib.request  # `import urllib` alone does not guarantee `urllib.request` is loaded

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_decision_forests as tfdf

tmpdir = tempfile.mkdtemp()

try:
    from wurlitzer import sys_pipes
except ImportError:
    # Only a missing wurlitzer should trigger the Colab fallback; any other
    # error must surface instead of being swallowed by a bare `except`.
    from colabtools.googlelog import CaptureLog as sys_pipes

input_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income"
input_column_header = "income_level"

# Load data
BASE_PATH = input_path

# Column names come from the `.names` file: comment lines start with "|", the
# column name is the text before the first ":", and the first two remaining
# lines are skipped (same `[2:]` slice as before).
with urllib.request.urlopen(f"{BASE_PATH}.names") as names_response:
    CSV_HEADER = [
        raw_line.decode("utf-8").split(":")[0].replace(" ", "_")
        for raw_line in names_response
        if not raw_line.startswith(b"|")
    ][2:]
CSV_HEADER.append(input_column_header)

train_data = pd.read_csv(f"{BASE_PATH}.data.gz", header=None, names=CSV_HEADER)
test_data = pd.read_csv(f"{BASE_PATH}.test.gz", header=None, names=CSV_HEADER)

# Replace the " ?" placeholder in this column with an explicit category.
for frame in (train_data, test_data):
    frame["migration_code-change_in_msa"] = frame[
        "migration_code-change_in_msa"
    ].replace(" ?", "Unansw")
print(train_data["migration_code-change_in_msa"].unique())

# Rename columns containing invalid characters. The mapping is declared once
# and applied to the header list and to both data frames so they cannot drift.
RENAMED_COLUMNS = {
    "fill_inc_questionnaire_for_veteran's_admin": "fill_inc_veterans_admin",
    "migration_code-change_in_msa": "migration_code_chx_in_msa",
    "migration_code-change_in_reg": "migration_code_chx_in_reg",
    "migration_code-move_within_reg": "migration_code_move_within_reg",
}
CSV_HEADER = [RENAMED_COLUMNS.get(name, name) for name in CSV_HEADER]

# Inspect the classes of the label, the input_column_header in this case
classes = train_data["income_level"].unique().tolist()
print(f"Label classes: {classes}")

train_data = train_data.rename(columns=RENAMED_COLUMNS)
test_data = test_data.rename(columns=RENAMED_COLUMNS)

# Convert from string to integers.
# This stage is necessary if your classification label is represented as a
# string. Note: Keras expects classification labels to be integers.
# `list.index` raises ValueError for a label that is not listed here.
target_labels = [" - 50000.", " 50000+."]
train_data[input_column_header] = train_data[input_column_header].map(
    target_labels.index
)
test_data[input_column_header] = test_data[input_column_header].map(
    target_labels.index
)

# Observe shape of training and test data
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(train_data.head().T)

# Define metadata
# Target column name.
TARGET_COLUMN_NAME = "income_level"
# Weight column name.
WEIGHT_COLUMN_NAME = "instance_weight"
# Numeric feature names.
NUMERIC_FEATURE_NAMES = [
    "age",
    "wage_per_hour",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "weeks_worked_in_year",
]
# Columns that are not categorical features: the numeric ones plus the weight
# and target columns. Built once instead of once per header entry.
_NON_CATEGORICAL_COLUMNS = set(
    NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME, TARGET_COLUMN_NAME]
)
# Categorical features and their vocabulary lists (sorted, stringified unique
# values observed in the training data).
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    feature_name: sorted(str(value) for value in train_data[feature_name].unique())
    for feature_name in CSV_HEADER
    if feature_name not in _NON_CATEGORICAL_COLUMNS
}
# All features names.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + list(CATEGORICAL_FEATURES_WITH_VOCABULARY)

"""Configure hyperparameters for the tree model."""

GROWING_STRATEGY = "BEST_FIRST_GLOBAL"
NUM_TREES = 250
MIN_EXAMPLES = 6
MAX_DEPTH = 5
SUBSAMPLE = 0.65
SAMPLING_METHOD = "RANDOM"  # NOTE(review): defined but not passed to the model below
VALIDATION_RATIO = 0.1


# Implement training & evaluation procedure
def prepare_sample(features, target, weight):
    """Cast non-string categorical features to strings.

    Args:
        features: mapping from feature name to tensor; updated in place.
        target: label tensor, returned unchanged.
        weight: sample-weight tensor, returned unchanged.

    Returns:
        The ``(features, target, weight)`` triple.
    """
    for feature_name in features:
        if feature_name not in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            continue
        if features[feature_name].dtype != tf.dtypes.string:
            # Convert categorical feature values to string.
            features[feature_name] = tf.strings.as_string(features[feature_name])
    return features, target, weight


def run_experiment(model, train_data, test_data, num_epochs=1, batch_size=None):
    """Fit *model* on ``train_data`` and print its accuracy on ``test_data``.

    Args:
        model: compiled model exposing ``fit`` and ``evaluate``; ``evaluate``
            must return a ``(loss, accuracy)`` pair.
        train_data: pandas DataFrame with the label and weight columns.
        test_data: pandas DataFrame with the same columns as ``train_data``.
        num_epochs: forwarded to ``model.fit`` as ``epochs``.
        batch_size: forwarded to ``model.fit``.
    """

    def to_dataset(frame):
        # Same conversion for both splits; the label/weight names come from
        # the module constants rather than repeated string literals.
        return tfdf.keras.pd_dataframe_to_tf_dataset(
            frame, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME
        ).map(prepare_sample, num_parallel_calls=tf.data.AUTOTUNE)

    train_dataset = to_dataset(train_data)
    test_dataset = to_dataset(test_data)

    model.fit(train_dataset, epochs=num_epochs, batch_size=batch_size)
    _, accuracy = model.evaluate(test_dataset, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")


# Create model inputs
def create_model_inputs():
    """Return a dict of scalar Keras ``Input`` layers, one per feature name.

    Numeric features use ``tf.float32``; every other feature uses ``tf.string``.
    """
    numeric_names = set(NUMERIC_FEATURE_NAMES)
    return {
        feature_name: layers.Input(
            name=feature_name,
            shape=(),
            dtype=tf.float32 if feature_name in numeric_names else tf.string,
        )
        for feature_name in FEATURE_NAMES
    }


"""# Experiment 1: Decision Forests with raw features"""


# Decision Forest with raw features
def specify_feature_usages(inputs):
    """Build one ``tfdf.keras.FeatureUsage`` per input.

    Inputs whose dtype is ``float32`` are declared NUMERICAL; all others are
    declared CATEGORICAL.

    Args:
        inputs: mapping from feature name to a Keras input (as produced by
            ``create_model_inputs``).

    Returns:
        List of feature usages in the iteration order of ``inputs``.
    """
    feature_usages = []
    for feature_name, model_input in inputs.items():
        if model_input.dtype == tf.dtypes.float32:
            semantic = tfdf.keras.FeatureSemantic.NUMERICAL
        else:
            semantic = tfdf.keras.FeatureSemantic.CATEGORICAL
        feature_usages.append(
            tfdf.keras.FeatureUsage(name=feature_name, semantic=semantic)
        )
    return feature_usages


# Create GB trees model
def create_gbt_model():
    """Create and compile the gradient boosted trees classifier.

    Hyperparameters come from the module-level constants above. The model is
    compiled with a binary accuracy metric named ``"accuracy"``.
    """
    gbt_model = tfdf.keras.GradientBoostedTreesModel(
        features=specify_feature_usages(create_model_inputs()),
        exclude_non_specified_features=True,
        growing_strategy=GROWING_STRATEGY,
        num_trees=NUM_TREES,
        max_depth=MAX_DEPTH,
        min_examples=MIN_EXAMPLES,
        subsample=SUBSAMPLE,
        validation_ratio=VALIDATION_RATIO,
        task=tfdf.keras.Task.CLASSIFICATION,
        loss="DEFAULT",
    )
    gbt_model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
    return gbt_model


# Train and evaluate model
gbt_model = create_gbt_model()
run_experiment(gbt_model, train_data, test_data)

# Inspect the model: Model type, mask, input features, feature importance.
# `summary()` prints by itself and returns None, so it is not wrapped in print().
gbt_model.summary()

inspector = gbt_model.make_inspector()
# Public attributes of the inspector (a notebook cell displayed this list;
# a script has to print it explicitly).
print([field for field in dir(inspector) if not field.startswith("_")])

# Plot the model
tfdf.model_plotter.plot_model_in_colab(gbt_model, tree_idx=0, max_depth=3)

# Display variable importance
print(inspector.variable_importances())

print("Model type:", inspector.model_type())
print("Number of trees:", inspector.num_trees())
print("Objective:", inspector.objective())
print("Input features:", inspector.features())

#save_path = os.path.join(tmpdir, "raw/1/")
gbt_model.save("/Users/tdubon/TF_Model")

"""# Creating HF Space"""

import os
import subprocess

from huggingface_hub import KerasModelHubMixin
from huggingface_hub.keras_mixin import push_to_hub_keras

push_to_hub_keras(gbt_model, repo_url="https://huggingface.co/keras-io/TF_Decision_Trees")


def _run_git(*git_args, cwd=None):
    """Run ``git <git_args>`` without a shell, optionally inside ``cwd``.

    A non-zero exit status is reported but does not raise, matching notebook
    ``!`` lines. Only the sub-command name is printed so that a credential
    embedded in an argument never reaches the logs.

    Returns:
        The exit status of the git command.
    """
    completed = subprocess.run(["git", *git_args], cwd=cwd, check=False)
    if completed.returncode != 0:
        print(f"git {git_args[0]} exited with status {completed.returncode}")
    return completed.returncode


# Clone and configure.
# SECURITY: an API token used to be hard-coded in the clone URL. It has been
# removed from the source and must be revoked. The token is now read from the
# HF_TOKEN environment variable; without it the repository is cloned
# anonymously.
HF_USER = "tdubon"
HF_REPO_NAME = "TF_Decision_Trees"
_hf_token = os.environ.get("HF_TOKEN")
_hf_credentials = f"{HF_USER}:{_hf_token}@" if _hf_token else ""
_run_git(
    "clone",
    f"https://{_hf_credentials}huggingface.co/{HF_USER}/{HF_REPO_NAME}",
)

_run_git("config", "--global", "user.email", "tdubon6@gmail.com")
# Tip: using the same email as for your huggingface.co account will link your commits to your profile
_run_git("config", "--global", "user.name", "tdubon")

# The former `!cd TFClassificationForest` had no effect (each `!` line runs in
# its own subshell) and named a directory the clone does not create, so the
# commands below ran outside the repository. They now run inside the clone.
# NOTE(review): nothing is written into the clone before this commit — confirm
# which files are meant to be pushed.
_run_git("add", ".", cwd=HF_REPO_NAME)
_run_git("commit", "-m", "Initial commit", cwd=HF_REPO_NAME)
_run_git("push", cwd=HF_REPO_NAME)

tf.keras.models.save_model(
    gbt_model,
    "/Users/tdubon/TFClassificationForest",
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None,
    save_traces=True,
)

# Commented out IPython magic to ensure Python compatibility.
gbt_model.make_inspector().export_to_tensorboard("/tmp/tb_logs/model_1")
# %load_ext tensorboard
# %tensorboard --logdir "/tmp/tb_logs"