import tensorflow as tf
import pandas as pd

from .constants import CSV_HEADER, TARGET_FEATURE_NAME, WEIGHT_COLUMN_NAME, NUMERIC_FEATURE_NAMES

# Helper functions for preprocessing the data.


def load_test_data():
    """Download the census-income test split and load it into a dataframe."""
    test_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.test.gz"
    test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)
    return test_data


test_data = load_test_data()

# Vocabulary of each categorical feature, built from the test data.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    feature_name: sorted([str(value) for value in list(test_data[feature_name].unique())])
    for feature_name in CSV_HEADER
    if feature_name
    not in list(NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME, TARGET_FEATURE_NAME])
}

# All feature names.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + list(
    CATEGORICAL_FEATURES_WITH_VOCABULARY.keys()
)

# Feature default values.
COLUMN_DEFAULTS = [
    [0.0]
    if feature_name in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME, WEIGHT_COLUMN_NAME]
    else ["NA"]
    for feature_name in CSV_HEADER
]


def process(features, target):
    for feature_name in features:
        if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            # Cast categorical feature values to string.
            features[feature_name] = tf.cast(features[feature_name], tf.dtypes.string)
    # Get the instance weight.
    weight = features.pop(WEIGHT_COLUMN_NAME)
    return features, target, weight


def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Create a batched tf.data.Dataset from a CSV file on disk."""
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        shuffle=shuffle,
    ).map(process)
    return dataset
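
# Illustrative usage (a sketch, not part of the app flow): the dataframe would first need
# to be written to disk with the target already binarized (COLUMN_DEFAULTS expects a
# numeric label), e.g.
#   test_data.to_csv("test_data.csv", index=False, header=False)
# after which it can be read back as a batched dataset:
#   test_dataset = get_dataset_from_csv("test_data.csv", batch_size=64)
#   features, target, weight = next(iter(test_dataset))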


def create_max_values_map():
    """Map each numeric feature to its maximum value in the test data."""
    max_values_map = {}
    for col in NUMERIC_FEATURE_NAMES:
        max_val = test_data[col].max()
        max_values_map["max_" + col] = max_val
    return max_values_map


def create_dropdown_default_values_map():
    """Map each categorical feature to a default value for its dropdown."""
    dropdown_default_values_map = {}
    for col in CATEGORICAL_FEATURES_WITH_VOCABULARY.keys():
        max_val = test_data[col].max()
        dropdown_default_values_map["max_" + col] = max_val
    return dropdown_default_values_map


def create_sample_test_data():
    """Binarize the target column and return the first rows as a list of lists."""
    test_data["income_level"] = test_data["income_level"].apply(
        lambda x: 0 if x == " - 50000." else 1
    )
    sample_df = test_data.loc[:20, :]
    sample_df_values = sample_df.values.tolist()
    return sample_df_values
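

# Minimal smoke test (illustrative sketch only). Because of the relative import at the top,
# this module has to be run from within its package, e.g. `python -m <package>.<this_module>`
# (the exact names depend on the app layout).
if __name__ == "__main__":
    # Maps presumably used to seed the app's input widgets.
    print("Numeric feature maxima:", create_max_values_map())
    print("Categorical dropdown defaults:", create_dropdown_default_values_map())

    # A small slice of the test set with the target binarized to 0/1.
    sample_rows = create_sample_test_data()
    print("Number of sample rows:", len(sample_rows))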