"""Column metadata and categorical vocabularies for the income-level dataset.

Importing this module calls ``load_test_data()`` once so that the vocabulary
of every categorical column can be collected from the values present in the
test data.
"""

import pandas as pd

from .preprocess import load_test_data

# Column names, in the order they appear in the CSV files.
CSV_HEADER = [
    "age",
    "class_of_worker",
    "detailed_industry_recode",
    "detailed_occupation_recode",
    "education",
    "wage_per_hour",
    "enroll_in_edu_inst_last_wk",
    "marital_stat",
    "major_industry_code",
    "major_occupation_code",
    "race",
    "hispanic_origin",
    "sex",
    "member_of_a_labor_union",
    "reason_for_unemployment",
    "full_or_part_time_employment_stat",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "tax_filer_stat",
    "region_of_previous_residence",
    "state_of_previous_residence",
    "detailed_household_and_family_stat",
    "detailed_household_summary_in_household",
    "instance_weight",
    "migration_code-change_in_msa",
    "migration_code-change_in_reg",
    "migration_code-move_within_reg",
    "live_in_this_house_1_year_ago",
    "migration_prev_res_in_sunbelt",
    "num_persons_worked_for_employer",
    "family_members_under_18",
    "country_of_birth_father",
    "country_of_birth_mother",
    "country_of_birth_self",
    "citizenship",
    "own_business_or_self_employed",
    "fill_inc_questionnaire_for_veterans_admin",
    "veterans_benefits",
    "weeks_worked_in_year",
    "year",
    "income_level",
]

# Target feature name.
TARGET_FEATURE_NAME = "income_level"

# Weight column name.
WEIGHT_COLUMN_NAME = "instance_weight"

# Numeric feature names.
NUMERIC_FEATURE_NAMES = [
    "age",
    "wage_per_hour",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "weeks_worked_in_year",
]

# Columns which use the "Number" component of gradio for taking user input.
NUMBER_INPUT_COLS = [
    "age",
    "num_persons_worked_for_employer",
    "weeks_worked_in_year",
]

# Loaded once at import time; only used below to collect the vocabularies.
test_data = load_test_data()

# Columns that are not categorical: the numeric features plus the weight and
# target columns. Built once so the comprehensions below do a set lookup
# instead of re-concatenating and scanning a list for every column.
_NON_CATEGORICAL_COLUMNS = frozenset(
    NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME, TARGET_FEATURE_NAME]
)

# Maps each categorical column to the sorted list of its distinct values in
# the test data, rendered as strings. Keys follow CSV_HEADER order.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    feature_name: sorted(str(value) for value in test_data[feature_name].unique())
    for feature_name in CSV_HEADER
    if feature_name not in _NON_CATEGORICAL_COLUMNS
}

# All feature names: numeric features first, then categorical ones.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + list(CATEGORICAL_FEATURES_WITH_VOCABULARY)

# Feature default values, one single-element list per CSV_HEADER column:
# [0.0] for numeric, target and weight columns, ["NA"] for everything else.
COLUMN_DEFAULTS = [
    [0.0] if feature_name in _NON_CATEGORICAL_COLUMNS else ["NA"]
    for feature_name in CSV_HEADER
]