import tensorflow as tf
import pandas as pd

from .constants import (
    CSV_HEADER,
    TARGET_FEATURE_NAME,
    WEIGHT_COLUMN_NAME,
    NUMERIC_FEATURE_NAMES,
)

# Helper functions for preprocessing the data.


def load_test_data():
    """Download the census income test split and return it as a DataFrame."""
    test_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.test.gz"
    test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)
    return test_data


test_data = load_test_data()

# Vocabulary of each categorical feature, built from the unique values
# observed in the test data.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    feature_name: sorted(
        [str(value) for value in list(test_data[feature_name].unique())]
    )
    for feature_name in CSV_HEADER
    if feature_name
    not in list(NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME, TARGET_FEATURE_NAME])
}

# All feature names.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + list(
    CATEGORICAL_FEATURES_WITH_VOCABULARY.keys()
)

# Feature default values.
COLUMN_DEFAULTS = [
    [0.0]
    if feature_name in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME, WEIGHT_COLUMN_NAME]
    else ["NA"]
    for feature_name in CSV_HEADER
]


def process(features, target):
    """Cast categorical features to string and split out the instance weight."""
    for feature_name in features:
        if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            # Cast categorical feature values to string.
            features[feature_name] = tf.cast(features[feature_name], tf.dtypes.string)
    # Get the instance weight.
    weight = features.pop(WEIGHT_COLUMN_NAME)
    return features, target, weight


def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Create a tf.data.Dataset of (features, target, weight) batches from a CSV file."""
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        shuffle=shuffle,
    ).map(process)
    return dataset


def create_max_values_map():
    """Return the maximum observed value of each numeric feature, keyed by 'max_<column>'."""
    max_values_map = {}
    for col in NUMERIC_FEATURE_NAMES:
        max_values_map["max_" + col] = test_data[col].max()
    return max_values_map


def create_dropdown_default_values_map():
    """Return a default value (the lexicographically largest) for each categorical feature, keyed by 'max_<column>'."""
    dropdown_default_values_map = {}
    for col in CATEGORICAL_FEATURES_WITH_VOCABULARY:
        dropdown_default_values_map["max_" + col] = test_data[col].max()
    return dropdown_default_values_map


def create_sample_test_data():
    """Binarize the target column and return the first rows of the test data as a list of lists."""
    test_data["income_level"] = test_data["income_level"].apply(
        lambda x: 0 if x == " - 50000." else 1
    )
    sample_df = test_data.loc[:20, :]  # .loc is end-inclusive: rows 0 through 20.
    sample_df_values = sample_df.values.tolist()
    return sample_df_values
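

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). Assumptions:
# "census_income_test.csv" is a hypothetical local path for a header-less CSV
# copy of the test split, matching header=False in get_dataset_from_csv, and
# this file is run as a module (e.g. `python -m <package>.<this_module>`)
# because of the relative import above.
if __name__ == "__main__":
    # Binarize the target so it parses with the float column default above.
    test_data[TARGET_FEATURE_NAME] = test_data[TARGET_FEATURE_NAME].apply(
        lambda x: 0 if x == " - 50000." else 1
    )
    # Persist the downloaded test data so make_csv_dataset can read it back.
    test_data.to_csv("census_income_test.csv", index=False, header=False)

    # Build a batched dataset of (features, target, weight) tuples and peek at one batch.
    dataset = get_dataset_from_csv("census_income_test.csv", shuffle=True, batch_size=64)
    features, target, weight = next(iter(dataset))
    print({name: tensor.shape for name, tensor in features.items()})
    print(target.shape, weight.shape)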