manivannan023 committed
Commit 4a58702
1 Parent(s): 3ee4302

Upload 83 files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. Dockerfile +8 -0
  2. api.py +69 -0
  3. api_requirements.txt +10 -0
  4. clustering/__init__.py +0 -0
  5. clustering/__pycache__/__init__.cpython-37.pyc +0 -0
  6. clustering/__pycache__/clustering.cpython-37.pyc +0 -0
  7. clustering/clustering.py +66 -0
  8. data_cleaning/__init__.py +0 -0
  9. data_cleaning/__pycache__/__init__.cpython-37.pyc +0 -0
  10. data_cleaning/__pycache__/data_cleaning.cpython-37.pyc +0 -0
  11. data_cleaning/data_cleaning.py +54 -0
  12. data_preprocessing/__init__.py +0 -0
  13. data_preprocessing/__pycache__/__init__.cpython-37.pyc +0 -0
  14. data_preprocessing/__pycache__/categorical_id_var_deletor.cpython-37.pyc +0 -0
  15. data_preprocessing/__pycache__/categorical_imputer.cpython-37.pyc +0 -0
  16. data_preprocessing/__pycache__/categorical_variance_threshold.cpython-37.pyc +0 -0
  17. data_preprocessing/__pycache__/multicollinearity_handler.cpython-37.pyc +0 -0
  18. data_preprocessing/__pycache__/numeric_imputer.cpython-37.pyc +0 -0
  19. data_preprocessing/__pycache__/outlier_handler.cpython-37.pyc +0 -0
  20. data_preprocessing/__pycache__/rare_category_encoder.cpython-37.pyc +0 -0
  21. data_preprocessing/categorical_id_var_deletor.py +47 -0
  22. data_preprocessing/categorical_imputer.py +50 -0
  23. data_preprocessing/categorical_variance_threshold.py +48 -0
  24. data_preprocessing/multicollinearity_handler.py +53 -0
  25. data_preprocessing/numeric_imputer.py +71 -0
  26. data_preprocessing/outlier_handler.py +68 -0
  27. data_preprocessing/rare_category_encoder.py +60 -0
  28. data_validation/__init__.py +0 -0
  29. data_validation/__pycache__/__init__.cpython-37.pyc +0 -0
  30. data_validation/__pycache__/data_validation.cpython-37.pyc +0 -0
  31. data_validation/data_validation.py +62 -0
  32. data_validation/input_data_specs.json +71 -0
  33. feature_construction/__init__.py +0 -0
  34. feature_construction/__pycache__/__init__.cpython-37.pyc +0 -0
  35. feature_construction/__pycache__/feature_construction.cpython-37.pyc +0 -0
  36. feature_construction/feature_construction.py +70 -0
  37. feature_selection/__init__.py +0 -0
  38. feature_selection/__pycache__/__init__.cpython-37.pyc +0 -0
  39. feature_selection/__pycache__/feature_selection.cpython-37.pyc +0 -0
  40. feature_selection/feature_selection.py +143 -0
  41. logger/__init__.py +0 -0
  42. logger/__pycache__/__init__.cpython-37.pyc +0 -0
  43. logger/__pycache__/logger.cpython-37.pyc +0 -0
  44. logger/logger.py +50 -0
  45. model_inference/__init__.py +0 -0
  46. model_inference/__pycache__/__init__.cpython-37.pyc +0 -0
  47. model_inference/__pycache__/model_inference.cpython-37.pyc +0 -0
  48. model_inference/model_inference.py +78 -0
  49. models/deployed_model.json +1 -0
  50. models/model_cold_2022_11_22/feature_importances.csv +36 -0
Dockerfile ADDED
@@ -0,0 +1,8 @@
+ FROM python:3.7-slim
+ WORKDIR /api
+ COPY . .
+ COPY ./docker/api_requirements.txt ./requirements.txt
+ RUN rm -r ./docker
+ RUN pip install --no-cache-dir -r requirements.txt
+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "5001", "--workers", "4"]
+ EXPOSE 5001
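For reference, a minimal sketch of building and serving this image locally; the image tag is illustrative, and the COPY/RUN lines above assume a ./docker directory that sits outside this 50-file view:

    docker build -t income-prediction-api .        # build from the repo root
    docker run -p 5001:5001 income-prediction-api  # serve on the exposed port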
api.py ADDED
@@ -0,0 +1,69 @@
+ from fastapi import FastAPI
+ import uvicorn
+ from pydantic import BaseModel
+ import pandas as pd
+ from data_validation.data_validation import DataValidation
+ from data_cleaning.data_cleaning import DataCleaning
+ from model_inference.model_inference import predict
+ from logger.logger import MongoLogger
+ import traceback
+
+ app = FastAPI()
+
+
+ class Data(BaseModel):
+     """
+     Data dictionary for data type validation
+     """
+     age: int
+     workclass: str
+     fnlwgt: int
+     education: str
+     education_num: int
+     marital_status: str
+     occupation: str
+     relationship: str
+     race: str
+     sex: str
+     capital_gain: int
+     capital_loss: int
+     hours_per_week: int
+     country: str
+
+
+ @app.post("/")
+ def prediction(data: Data):
+     """
+     Processes the API request and returns a prediction
+     """
+     logger = MongoLogger()
+     logger.log_to_db(level="INFO", message="entering prediction_api")
+     try:
+         df = pd.DataFrame(data.dict(), index=[0])  # converting api data dict to df
+         dv = DataValidation(input_df=df, dataset="prediction")  # validating the data
+         validation_status = dv.validate_data()  # status of validation. 1=passed, 0=failed
+
+         if validation_status != 0:
+             data_cleaning = DataCleaning()
+             # cleaning the data
+             df = data_cleaning.clean_column_names(df).copy()
+             df = data_cleaning.shorten_column_names(df).copy()
+             df = data_cleaning.clean_nan(df).copy()
+             # calling the 'model_inference.model_inference.predict' function
+             pred = predict(df, predict_proba=False, predict_label=True)[0].strip()
+
+         else:
+             # executes when data validation fails
+             pred = "data validation failed"
+
+     except Exception as e:
+         # executes in case of any exception
+         pred = e
+         logger.log_to_db(level="CRITICAL", message=f"unexpected error in prediction_api: {traceback.format_exc()}")
+         raise
+     logger.log_to_db(level="INFO", message="exiting prediction_api")
+     return {"result": pred}
+
+
+ if __name__ == '__main__':
+     uvicorn.run(app=app, host='0.0.0.0', port=5001, workers=4)
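A request sketch against this endpoint, using the requests client (illustrative; requests is not pinned in api_requirements.txt). The field values mirror the raw Adult census encoding, including the leading spaces in categorical values:

    import requests  # illustrative client library

    sample = {"age": 39, "workclass": " State-gov", "fnlwgt": 77516, "education": " Bachelors",
              "education_num": 13, "marital_status": " Never-married", "occupation": " Adm-clerical",
              "relationship": " Not-in-family", "race": " White", "sex": " Male",
              "capital_gain": 2174, "capital_loss": 0, "hours_per_week": 40, "country": " United-States"}
    print(requests.post("http://localhost:5001/", json=sample).json())  # e.g. {"result": "<=50K"}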
api_requirements.txt ADDED
@@ -0,0 +1,10 @@
+ scikit-learn==1.0.2
+ xgboost==1.6.2
+ fastapi==0.85.2
+ pydantic==1.10.2
+ uvicorn==0.19.0
+ pymongo==4.3.3
+ pandas==1.3.5
+ cloudpickle==2.2.0
+ hyperopt==0.2.7
+ imblearn==0.0
clustering/__init__.py ADDED
File without changes
clustering/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (162 Bytes).
clustering/__pycache__/clustering.cpython-37.pyc ADDED
Binary file (3.03 kB).
clustering/clustering.py ADDED
@@ -0,0 +1,66 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.cluster import KMeans
+ from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
+ from sklearn.metrics import silhouette_score
+
+
+ class AutoCluster:
+     """
+     Returns clusters by automatically evaluating 'k' using silhouette_score.
+     'k' range = [min_cluster, max_cluster]
+     """
+     def __init__(self, min_cluster: int = 2, max_cluster: int = 10, random_state: int = 42):
+         self.__scaler = None
+         self.__ohe = None
+         self.k = None
+         self.min_cluster = min_cluster
+         self.max_cluster = max_cluster
+         self.kmeans_model = None
+         self.random_state = random_state
+
+     def __fit_scaler(self, x: pd.DataFrame):
+         x = x.copy()
+         self.__scaler = MinMaxScaler()
+         self.__scaler.fit(x)
+
+     def __find_best_k(self, x: pd.DataFrame):
+         x = x.copy()
+         self.__fit_scaler(x)
+         x_scaled = self.__scaler.transform(x)
+         silhouette_scores = []
+         for k in range(self.min_cluster, self.max_cluster + 1):
+             kmeans = KMeans(n_clusters=k, random_state=self.random_state)
+             kmeans.fit(x_scaled)
+             labels = kmeans.labels_
+             silhouette_scores.append(silhouette_score(X=x_scaled, labels=labels, random_state=self.random_state))
+         self.k = self.min_cluster + np.argmax(silhouette_scores)
+
+     def __fit_one_hot_encoder(self, x: pd.DataFrame):
+         self.__ohe = OneHotEncoder(sparse=False)
+         self.__ohe.fit(x)
+
+     def fit(self, x: pd.DataFrame):
+         x = x.copy()
+         self.__find_best_k(x)
+         self.kmeans_model = KMeans(n_clusters=self.k, random_state=self.random_state)
+         x_scaled = self.__scaler.transform(x)
+         self.kmeans_model.fit(x_scaled)
+         prediction_df = pd.DataFrame({'cluster': self.kmeans_model.predict(x_scaled)})
+         self.__fit_one_hot_encoder(prediction_df)
+         return self
+
+     def predict(self, x: pd.DataFrame):
+         x = x.copy()
+         x_scaled = self.__scaler.transform(x)
+         prediction = self.kmeans_model.predict(x_scaled)
+         prediction_df = pd.DataFrame({'cluster': prediction})
+         prediction_ohe = pd.DataFrame(self.__ohe.transform(prediction_df), columns=self.__ohe.get_feature_names_out())
+         return prediction_ohe, prediction
+
+     def fit_predict(self, x: pd.DataFrame):
+         self.fit(x)
+         return self.predict(x)  # return the predictions instead of discarding them
+
+     def __repr__(self):
+         return f"AutoCluster(min_cluster={self.min_cluster}, max_cluster={self.max_cluster}, random_state={self.random_state})"
data_cleaning/__init__.py ADDED
File without changes
data_cleaning/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (165 Bytes).
data_cleaning/__pycache__/data_cleaning.cpython-37.pyc ADDED
Binary file (2.88 kB).
data_cleaning/data_cleaning.py ADDED
@@ -0,0 +1,54 @@
+ import pandas as pd
+ import re
+ import numpy as np
+ from logger.logger import MongoLogger
+
+
+ class DataCleaning:
+     def __init__(self):
+         self.logger = MongoLogger()
+
+     def clean_column_names(self, input_df: pd.DataFrame):
+         """
+         Replaces special characters in column names with underscores and
+         converts column names to lowercase
+         """
+         self.logger.log_to_db(level="INFO", message="entering data_cleaning.clean_column_names")
+         try:
+             df = input_df.copy()
+             clean_col_names = [re.sub(r"[\.\?\s]", "_", col_name.lower().strip()) for col_name in df.columns]
+             df.columns = clean_col_names
+         except Exception as e:
+             self.logger.log_to_db(level="CRITICAL", message=f"unexpected data_cleaning.clean_column_names error: {e}")
+             raise
+         self.logger.log_to_db(level="INFO", message="exiting data_cleaning.clean_column_names")
+         return df
+
+     def shorten_column_names(self, input_df: pd.DataFrame, max_len: int = 25):
+         """
+         Truncates column names to max_len characters
+         """
+         self.logger.log_to_db(level="INFO", message="entering data_cleaning.shorten_column_names")
+         try:
+             df = input_df.copy()
+             short_col_names = [col_name[:max_len] for col_name in df.columns]
+             df.columns = short_col_names
+         except Exception as e:
+             self.logger.log_to_db(level="CRITICAL", message=f"unexpected data_cleaning.shorten_column_names error: {e}")
+             raise
+         self.logger.log_to_db(level="INFO", message="exiting data_cleaning.shorten_column_names")
+         return df
+
+     def clean_nan(self, input_df: pd.DataFrame, to_replace: list = [' ?', '?', '-', '_', -1, "-1"]):
+         """
+         Replaces placeholder values (' ?', '?', '-', '_', -1, "-1") in the data with NaN
+         """
+         self.logger.log_to_db(level="INFO", message="entering data_cleaning.clean_nan")
+         try:
+             df = input_df.copy()
+             df.replace(to_replace=to_replace, value=np.nan, inplace=True, regex=False)
+         except Exception as e:
+             self.logger.log_to_db(level="CRITICAL", message=f"unexpected data_cleaning.clean_nan error: {e}")
+             raise
+         self.logger.log_to_db(level="INFO", message="exiting data_cleaning.clean_nan")
+         return df
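A quick sketch of the cleaning sequence the API applies, with illustrative column and cell values:

    import pandas as pd
    from data_cleaning.data_cleaning import DataCleaning

    raw = pd.DataFrame({"Marital Status": [" Never-married", " ?"], "Age": [39, -1]})
    dc = DataCleaning()
    df = dc.clean_column_names(raw)   # -> columns: marital_status, age
    df = dc.shorten_column_names(df)  # truncates names longer than 25 chars
    df = dc.clean_nan(df)             # ' ?' and -1 become NaN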
data_preprocessing/__init__.py ADDED
File without changes
data_preprocessing/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (170 Bytes).
data_preprocessing/__pycache__/categorical_id_var_deletor.cpython-37.pyc ADDED
Binary file (2.52 kB).
data_preprocessing/__pycache__/categorical_imputer.cpython-37.pyc ADDED
Binary file (2.7 kB).
data_preprocessing/__pycache__/categorical_variance_threshold.cpython-37.pyc ADDED
Binary file (2.71 kB).
data_preprocessing/__pycache__/multicollinearity_handler.cpython-37.pyc ADDED
Binary file (2.68 kB).
data_preprocessing/__pycache__/numeric_imputer.cpython-37.pyc ADDED
Binary file (3.17 kB).
data_preprocessing/__pycache__/outlier_handler.cpython-37.pyc ADDED
Binary file (2.77 kB).
data_preprocessing/__pycache__/rare_category_encoder.cpython-37.pyc ADDED
Binary file (2.92 kB).
data_preprocessing/categorical_id_var_deletor.py ADDED
@@ -0,0 +1,47 @@
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class CategoricalIdVarDeletor(BaseEstimator, TransformerMixin):
+     """
+     Removes columns whose proportion of unique categories is >= threshold.
+     With threshold=1, a column is removed only when 100% of its categories are
+     unique, i.e. it behaves like an ID column.
+     Doesn't remove columns that are all NA; NA is counted as a separate category.
+     """
+     def __init__(self, threshold: float = 1):
+         self.threshold = threshold
+         self.selected_features = []
+
+     @staticmethod
+     def get_unique_cat_percent(x: pd.Series, n_rows: int):
+         x = x.copy()
+         return len(x.fillna("!@#$%This value is missing^&*(").unique()) / n_rows
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_id_var_deletor.fit")
+             x = x.copy()
+             unique_cat_percent = x.apply(self.get_unique_cat_percent, n_rows=x.shape[0])
+             self.selected_features = [*unique_cat_percent[unique_cat_percent < self.threshold].index]
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_id_var_deletor.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_id_var_deletor.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_id_var_deletor.transform")
+             x = x.copy()
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_id_var_deletor.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_id_var_deletor.transform")
+         return x[self.selected_features]
+
+     def get_feature_names_out(self, input_features=None):
+         return self.selected_features
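All the data_preprocessing transformers share the sklearn fit/transform contract; a minimal sketch with this one on illustrative data:

    import pandas as pd
    from data_preprocessing.categorical_id_var_deletor import CategoricalIdVarDeletor

    df = pd.DataFrame({"user_id": ["u1", "u2", "u3"],  # every value unique -> dropped
                       "workclass": [" Private", " Private", " State-gov"]})
    deletor = CategoricalIdVarDeletor(threshold=1).fit(df)
    print(deletor.transform(df).columns.tolist())  # ['workclass']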
data_preprocessing/categorical_imputer.py ADDED
@@ -0,0 +1,50 @@
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class CategoricalImputer(BaseEstimator, TransformerMixin):
+     """
+     Imputes missing values in categorical features.
+     strategy:
+         - "most_frequent": imputes the most frequent value (mode).
+         - "constant": imputes a constant named "<column_name>_missing".
+     """
+     def __init__(self, strategy="most_frequent"):
+         self.strategy = strategy
+         self.fill_values = None
+         self.feature_names = []
+
+     def find_fill_values(self, x):
+         if self.strategy == 'constant':
+             self.fill_values = [f'{column}_missing' for column in [*x.columns]]
+         else:
+             self.fill_values = [*x.apply(lambda column: [*column.value_counts().index][0])]
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_imputer.fit")
+             self.feature_names = [*x.columns]
+             self.find_fill_values(x)
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_imputer.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_imputer.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_imputer.transform")
+             x = x.copy()
+             for i, column in enumerate([*x.columns]):
+                 x[column] = x[column].fillna(self.fill_values[i])
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_imputer.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_imputer.transform")
+         return x
+
+     def get_feature_names_out(self, input_features=None):
+         return self.feature_names
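A sketch of the two imputation strategies on an illustrative column:

    import numpy as np
    import pandas as pd
    from data_preprocessing.categorical_imputer import CategoricalImputer

    df = pd.DataFrame({"occupation": [" Sales", " Sales", np.nan]})
    print(CategoricalImputer(strategy="most_frequent").fit(df).transform(df))  # NaN -> ' Sales'
    print(CategoricalImputer(strategy="constant").fit(df).transform(df))       # NaN -> 'occupation_missing'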
data_preprocessing/categorical_variance_threshold.py ADDED
@@ -0,0 +1,48 @@
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class CategoricalVarianceThreshold(BaseEstimator, TransformerMixin):
+     """
+     Removes low-variance categorical columns, i.e. columns where the proportion of
+     the most frequent category is >= threshold.
+     Columns that are all NA are removed; NA is counted as a separate category.
+     """
+
+     def __init__(self, threshold: float = 0.99):
+         self.threshold = threshold
+         self.selected_features = []
+
+     @staticmethod
+     def __get_max_category_proportions(x: pd.DataFrame):
+         x = x.copy()
+         return x.apply(lambda col: col.fillna("!@#$%This value is missing^&*(").value_counts(normalize=True).values[0])
+
+     def fit(self, x, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_variance_threshold.fit")
+             x = x.copy()
+             max_category_proportions = self.__get_max_category_proportions(x)
+             self.selected_features = [*max_category_proportions[max_category_proportions < self.threshold].index]
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_variance_threshold.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_variance_threshold.fit")
+         return self
+
+     def transform(self, x, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_variance_threshold.transform")
+             x = x.copy()
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL",
+                              message=f"unexpected categorical_variance_threshold.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_variance_threshold.transform")
+         return x[self.selected_features]
+
+     def get_feature_names_out(self, input_features=None):
+         return self.selected_features
data_preprocessing/multicollinearity_handler.py ADDED
@@ -0,0 +1,53 @@
+ import pandas as pd
+ import numpy as np
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
+ from sklearn.base import BaseEstimator, TransformerMixin
+ import warnings
+ from logger.logger import MongoLogger
+ warnings.filterwarnings("ignore")
+
+
+ class MulticollinearityHandler(BaseEstimator, TransformerMixin):
+     """
+     Removes numeric variables having VIF > threshold
+     """
+     def __init__(self, threshold: float = 10):
+         self.threshold = threshold
+         self.selected_features = []
+
+     @staticmethod
+     def get_vif(x: pd.DataFrame):
+         x = x.copy()
+         vif = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]  # calculate VIF per column
+         return vif
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering multicollinearity_handler.fit")
+             x = x.copy()
+             vif = self.get_vif(x)
+             while np.max(vif) > self.threshold:  # keep looping while any VIF > threshold
+                 max_vif_column = x.columns[np.argmax(vif)]  # column with max VIF
+                 x = x.drop(columns=[max_vif_column]).copy()  # drop the worst column, one at a time
+                 vif = self.get_vif(x)
+             self.selected_features = [*x.columns]  # remaining columns, i.e. columns without multicollinearity
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected multicollinearity_handler.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting multicollinearity_handler.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering multicollinearity_handler.transform")
+             x = x.copy()
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected multicollinearity_handler.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting multicollinearity_handler.transform")
+         return x[self.selected_features]
+
+     def get_feature_names_out(self, input_features=None):
+         return self.selected_features
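A sketch of the VIF-pruning behaviour on synthetic data, where one column is a near-duplicate of another (data is illustrative):

    import numpy as np
    import pandas as pd
    from data_preprocessing.multicollinearity_handler import MulticollinearityHandler

    rng = np.random.default_rng(42)
    a = rng.normal(size=200)
    df = pd.DataFrame({"a": a,
                       "b": a * 2 + rng.normal(scale=0.01, size=200),  # near-duplicate of "a"
                       "c": rng.normal(size=200)})
    handler = MulticollinearityHandler(threshold=10).fit(df)
    print(handler.selected_features)  # one of the collinear pair is dropped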
data_preprocessing/numeric_imputer.py ADDED
@@ -0,0 +1,71 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class NumericImputer(BaseEstimator, TransformerMixin):
+     """
+     Imputes missing values in numeric features.
+     strategy:
+         - "mean": imputes the mean.
+         - "median": imputes the median.
+         - "lower_bound": imputes the lower bound of the IQR method for outlier detection.
+         - "upper_bound": imputes the upper bound of the IQR method for outlier detection.
+     The lower/upper bounds can be controlled via bound_factor. A higher bound_factor results
+     in end-of-distribution imputation, the numeric analogue of adding a 'missing' category
+     in categorical imputation.
+     """
+     def __init__(self, strategy: str = "median", bound_factor: float = 1.5):
+         self.strategy = strategy
+         self.bound_factor = bound_factor
+         self.fill_values = []
+         self.feature_names = []
+
+     def find_bounds(self, x: pd.Series):
+         """
+         Finds the lower/upper bound using the IQR method for outlier detection.
+         """
+         q1 = x.quantile(0.25)
+         q3 = x.quantile(0.75)
+         iqr = q3 - q1
+         if self.strategy == "lower_bound":
+             bound = q1 - (self.bound_factor * iqr)
+         else:
+             bound = q3 + (self.bound_factor * iqr)
+         return bound
+
+     def find_fill_values(self, x: pd.Series):
+         if self.strategy == 'mean':
+             self.fill_values.append(np.mean(x.dropna()))
+         elif self.strategy == 'median':
+             self.fill_values.append(np.median(x.dropna()))
+         else:
+             self.fill_values.append(self.find_bounds(x.dropna()))
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering numeric_imputer.fit")
+             self.fill_values = []  # reset so a refit doesn't accumulate stale values
+             self.feature_names = [*x.columns]
+             x.apply(self.find_fill_values)
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected numeric_imputer.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting numeric_imputer.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering numeric_imputer.transform")
+             x = x.copy()
+             for i, column in enumerate([*x.columns]):
+                 x[column] = x[column].fillna(self.fill_values[i])
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected numeric_imputer.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting numeric_imputer.transform")
+         return x
+
+     def get_feature_names_out(self, input_features=None):
+         return self.feature_names
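A sketch of two of the strategies on an illustrative column:

    import numpy as np
    import pandas as pd
    from data_preprocessing.numeric_imputer import NumericImputer

    df = pd.DataFrame({"age": [25.0, 30.0, np.nan, 90.0]})
    print(NumericImputer(strategy="median").fit(df).transform(df))       # NaN -> 30.0
    print(NumericImputer(strategy="upper_bound").fit(df).transform(df))  # NaN -> Q3 + 1.5*IQR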
data_preprocessing/outlier_handler.py ADDED
@@ -0,0 +1,68 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class OutlierHandler(BaseEstimator, TransformerMixin):
+     """
+     Detects outliers using the IQR method, then either replaces them with NaN or
+     caps them at the IQR lower/upper bounds, i.e. winsorizes them.
+     """
+     def __init__(self, method: str = 'winsorize', factor: float = 1.5):
+         self.method = method
+         self.factor = factor
+         self.lower_bounds = []
+         self.upper_bounds = []
+         self.feature_names = []
+
+     def detect_bounds(self, x: pd.Series):
+         """
+         Detects the lower and upper bounds using the IQR method
+         """
+         x = x.copy()
+         q1 = x.quantile(0.25)
+         q3 = x.quantile(0.75)
+         iqr = q3 - q1
+         lower_bound = q1 - (self.factor * iqr)
+         upper_bound = q3 + (self.factor * iqr)
+         self.lower_bounds.append(lower_bound)
+         self.upper_bounds.append(upper_bound)
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering outlier_handler.fit")
+             x = x.copy()
+             self.lower_bounds = []  # reset so a refit doesn't accumulate stale bounds
+             self.upper_bounds = []
+             self.feature_names = [*x.columns]
+             x.apply(self.detect_bounds)
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected outlier_handler.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting outlier_handler.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering outlier_handler.transform")
+             x = x.copy()
+             for i, column in enumerate(x.columns):
+                 lower_bound = self.lower_bounds[i]
+                 upper_bound = self.upper_bounds[i]
+                 lower_repl = np.nan
+                 upper_repl = np.nan
+                 if self.method == 'winsorize':
+                     lower_repl = lower_bound
+                     upper_repl = upper_bound
+                 x.loc[(x[column] < lower_bound), column] = lower_repl
+                 x.loc[(x[column] > upper_bound), column] = upper_repl
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected outlier_handler.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting outlier_handler.transform")
+         return x
+
+     def get_feature_names_out(self, input_features=None):
+         return self.feature_names
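A winsorizing sketch on an illustrative column:

    import pandas as pd
    from data_preprocessing.outlier_handler import OutlierHandler

    df = pd.DataFrame({"hours_per_week": [38, 40, 40, 42, 45, 99]})
    handler = OutlierHandler(method="winsorize", factor=1.5).fit(df)
    print(handler.transform(df))  # 99 is capped at the IQR upper bound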
data_preprocessing/rare_category_encoder.py ADDED
@@ -0,0 +1,60 @@
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class RareCategoryEncoder(BaseEstimator, TransformerMixin):
+     """
+     Replaces rare categories with replace_value ('rare_category' by default) and stores
+     the frequent categories per column. Any category unseen during fit is also replaced
+     with replace_value at transform time.
+     NAs are left untouched and excluded from the frequency calculation: in a column whose
+     only non-NA value is 'a', 'a' is kept as frequent, since the only observed value
+     can't be rare.
+     """
+     def __init__(self, threshold: float = 0.05, replace_value: str = 'rare_category'):
+         self.threshold = threshold
+         self.replace_value = replace_value
+         self.feature_names = []
+         self.frequent_cat_list = []
+
+     def __frequent_category_detector(self, x: pd.Series, y=None):
+         x = x.copy()
+         val_counts = x.value_counts(normalize=True)
+         # frequent categories in a column are the ones whose frequency > threshold
+         frequent_cats = [*val_counts[val_counts > self.threshold].index]
+         self.frequent_cat_list.append(frequent_cats)
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering rare_category_encoder.fit")
+             x = x.copy()
+             self.frequent_cat_list = []  # reset so a refit doesn't accumulate stale lists
+             self.feature_names = [*x.columns]
+             x.apply(self.__frequent_category_detector)
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected rare_category_encoder.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting rare_category_encoder.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering rare_category_encoder.transform")
+             x = x.copy()
+             for i in range(x.shape[1]):
+                 x_ser = x.iloc[:, i].copy()
+                 # replace values in each column that are neither frequent nor NaN with replace_value
+                 x_ser[~x_ser.isin(self.frequent_cat_list[i]) & x_ser.notna()] = self.replace_value
+                 x.iloc[:, i] = x_ser
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected rare_category_encoder.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting rare_category_encoder.transform")
+         return x
+
+     def get_feature_names_out(self, input_features=None):
+         return self.feature_names
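A sketch of the rare/unseen-category behaviour; ' Atlantis' is a deliberately fictitious category:

    import pandas as pd
    from data_preprocessing.rare_category_encoder import RareCategoryEncoder

    train = pd.DataFrame({"country": [" United-States"] * 19 + [" Holand-Netherlands"]})
    encoder = RareCategoryEncoder(threshold=0.05).fit(train)
    test = pd.DataFrame({"country": [" United-States", " Holand-Netherlands", " Atlantis"]})
    print(encoder.transform(test)["country"].tolist())
    # [' United-States', 'rare_category', 'rare_category']  -- unseen values are mapped too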
data_validation/__init__.py ADDED
File without changes
data_validation/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (167 Bytes).
data_validation/__pycache__/data_validation.cpython-37.pyc ADDED
Binary file (2.27 kB).
data_validation/data_validation.py ADDED
@@ -0,0 +1,62 @@
+ from utils.json_parser import JSONParser
+ import pandas as pd
+ import os
+ from logger.logger import MongoLogger
+
+
+ class DataValidation:
+     """
+     - Class to validate input data as per the data dictionary. It validates the column count,
+       column data types and column names.
+     - Returns 1 if data is valid, else 0
+     - dataset: str
+         - train: for the train set
+         - test: for the test set
+         - prediction: for single-sample inference / batch inference
+     """
+     def __init__(self, input_df: pd.DataFrame, dataset: str = "train"):
+         self.input_df = input_df
+         self.dataset = dataset
+
+     def validate_data(self):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering data_validation")
+             status = 1
+             # parsing input data specification JSON
+             json_parser = JSONParser(os.path.join('.', "data_validation", 'input_data_specs.json'))
+             input_data_specs_dict = json_parser.parse_json()
+             # column specs for train and test data
+             column_count_key = 'train_test_column_count'
+             column_name_key = 'train_test_column_names'
+             column_types_key = 'train_test_column_dtypes'
+             # column specs for prediction data; prediction data shouldn't have the target (salary) column
+             if self.dataset == "prediction":
+                 column_count_key = 'prediction_column_count'
+                 column_name_key = 'prediction_column_names'
+                 column_types_key = 'prediction_column_dtypes'
+
+             n_cols = input_data_specs_dict[column_count_key]
+             col_names = input_data_specs_dict[column_name_key]
+             col_dtypes = input_data_specs_dict[column_types_key]
+
+             if len(self.input_df.columns) != n_cols:
+                 status = 0
+                 logger.log_to_db(level="CRITICAL",
+                                  message=f"{self.dataset} data_validation failed: column count doesn't match")
+
+             if col_names != [*self.input_df.columns]:
+                 status = 0
+                 logger.log_to_db(level="CRITICAL",
+                                  message=f"{self.dataset} data_validation failed: column names don't match")
+
+             if col_dtypes != self.input_df.dtypes.tolist():
+                 status = 0
+                 logger.log_to_db(level="CRITICAL",
+                                  message=f"{self.dataset} data_validation failed: column dtypes don't match")
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected data_validation error: {e}")
+             raise
+
+         logger.log_to_db(level="INFO", message="exiting data_validation")
+         return status
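A sketch of validating a prediction batch against input_data_specs.json (the CSV name is illustrative):

    import pandas as pd
    from data_validation.data_validation import DataValidation

    df = pd.read_csv("prediction_batch.csv")  # must match the prediction schema below
    status = DataValidation(input_df=df, dataset="prediction").validate_data()
    if status == 1:
        print("schema OK")  # 14 columns with the expected names and dtypes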
data_validation/input_data_specs.json ADDED
@@ -0,0 +1,71 @@
+ {
+   "train_test_column_count": 15,
+   "train_test_column_names": [
+     "age",
+     "workclass",
+     "fnlwgt",
+     "education",
+     "education_num",
+     "marital_status",
+     "occupation",
+     "relationship",
+     "race",
+     "sex",
+     "capital_gain",
+     "capital_loss",
+     "hours_per_week",
+     "country",
+     "salary"
+   ],
+   "train_test_column_dtypes": [
+     "int64",
+     "O",
+     "int64",
+     "O",
+     "int64",
+     "O",
+     "O",
+     "O",
+     "O",
+     "O",
+     "int64",
+     "int64",
+     "int64",
+     "O",
+     "O"
+   ],
+   "prediction_column_count": 14,
+   "prediction_column_names": [
+     "age",
+     "workclass",
+     "fnlwgt",
+     "education",
+     "education_num",
+     "marital_status",
+     "occupation",
+     "relationship",
+     "race",
+     "sex",
+     "capital_gain",
+     "capital_loss",
+     "hours_per_week",
+     "country"
+   ],
+   "prediction_column_dtypes": [
+     "int64",
+     "O",
+     "int64",
+     "O",
+     "int64",
+     "O",
+     "O",
+     "O",
+     "O",
+     "O",
+     "int64",
+     "int64",
+     "int64",
+     "O"
+   ]
+ }
feature_construction/__init__.py ADDED
File without changes
feature_construction/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (172 Bytes).
feature_construction/__pycache__/feature_construction.cpython-37.pyc ADDED
Binary file (1.71 kB).
feature_construction/feature_construction.py ADDED
@@ -0,0 +1,70 @@
+ import pandas as pd
+ from logger.logger import MongoLogger
+
+
+ class FeatureConstructor:
+     """
+     Adds new engineered features to input data
+     """
+     @staticmethod
+     def add_features(x: pd.DataFrame):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering add_features")
+             x = x.copy()
+
+             education_group = {' 5th-6th': 'middle_school',
+                                ' 7th-8th': 'middle_school',
+                                ' 9th': 'middle_school',
+                                ' 10th': 'high_school',
+                                ' 11th': 'high_school',
+                                ' 12th': 'high_school',
+                                ' HS-grad': 'hs_grad',
+                                ' Prof-school': 'high_school',
+                                ' Some-college': 'college',
+                                ' Masters': 'college',
+                                ' Bachelors': 'college',
+                                ' 1st-4th': 'primary_school',
+                                ' Preschool': 'primary_school',
+                                ' Assoc-voc': 'college',
+                                ' Assoc-acdm': 'college',
+                                ' Doctorate': 'doctorate'}
+             x['education_group'] = x['education'].map(education_group)
+
+             # workclass_group has a very low mutual info score, so it was removed
+
+             # workclass_group = {' Federal-gov': 'government',
+             #                    ' Local-gov': 'government',
+             #                    ' State-gov': 'government',
+             #                    ' Private': 'private',
+             #                    ' Self-emp-inc': 'self_emp',
+             #                    ' Self-emp-not-inc': 'self_emp',
+             #                    ' Never-worked': 'no_work',
+             #                    ' Without-pay': 'no_work'}
+             # x['workclass_group'] = x['workclass'].map(workclass_group)
+
+             is_single = {' Divorced': 1,
+                          ' Married-spouse-absent': 1,
+                          ' Never-married': 1,
+                          ' Separated': 1,
+                          ' Widowed': 1,
+                          ' Married-AF-spouse': 0,
+                          ' Married-civ-spouse': 0}
+             x['is_single'] = x['marital_status'].map(is_single)
+
+             x['relationship_marital'] = x['relationship'] + x['marital_status']
+
+             # x.loc[(x['capital_gain'] > 5000), 'capital_gain'] = 5000
+
+             x['has_capital_gain'] = (x['capital_gain'] > 0).astype("int")
+
+             # x['has_capital_loss'] = (x['capital_loss'] > 0).astype("int")
+
+             x['age_of_first_edu'] = x['age'] - x['education_num']
+
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected add_features error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting add_features")
+
+         return x
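A one-row sketch of the constructed features (values are illustrative):

    import pandas as pd
    from feature_construction.feature_construction import FeatureConstructor

    row = pd.DataFrame([{"age": 39, "education": " Bachelors", "education_num": 13,
                         "marital_status": " Never-married", "relationship": " Not-in-family",
                         "capital_gain": 2174}])
    out = FeatureConstructor.add_features(row)
    print(out[["education_group", "is_single", "has_capital_gain", "age_of_first_edu"]])
    # college, 1, 1, 26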
feature_selection/__init__.py ADDED
File without changes
feature_selection/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (169 Bytes).
feature_selection/__pycache__/feature_selection.cpython-37.pyc ADDED
Binary file (5.42 kB).
feature_selection/feature_selection.py ADDED
@@ -0,0 +1,143 @@
+ import pandas as pd
+ import numpy as np
+ from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
+ from sklearn.ensemble import RandomForestClassifier
+ from xgboost import XGBClassifier
+ from sklearn.metrics import roc_auc_score
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.model_selection import StratifiedKFold
+ from utils.find_class_weights import find_class_weights
+ from sklearn.utils import class_weight
+ from logger.logger import MongoLogger
+
+
+ class FeatureSelector(BaseEstimator, TransformerMixin):
+     """
+     - Feature selector selects n_features by training RandomForestClassifier and XGBClassifier.
+     - It uses hyperopt hyperparameter tuning to choose the best model of the two.
+     - It uses the feature importances of the best model to select features.
+     """
+     def __init__(self, n_features: int, n_trials: int = 10, cv_splits: int = 2):
+         self.n_features = int(n_features)
+         self.n_trials = n_trials
+         self.cv_splits = cv_splits
+         self.selected_features = []
+         self.feature_importance = None
+
+     def __find_best_model(self, x: pd.DataFrame, y: pd.Series):
+         x = x.reset_index(drop=True).copy()
+         y = y.reset_index(drop=True).copy()
+         classifier_params = ['random_forest', 'xgboost']
+         max_depth_params = [3, 4, 5, 6, 7, 8, 9, 10]
+         n_estimators_params = [50, 100, 200, 300, 500]
+         eta_params = [0.1, 0.3, 0.01, 0.001, 0.0001, 1]
+         search_space = {'classifier': hp.choice('classifier', [
+             {
+                 'type': classifier_params[0],
+                 'max_depth': hp.choice('rf_max_depth', max_depth_params),
+                 'n_estimators': hp.choice('rf_n_estimators', n_estimators_params)
+             },
+             {
+                 'type': classifier_params[1],
+                 'max_depth': hp.choice('xgb_max_depth', max_depth_params),
+                 'n_estimators': hp.choice('xgb_n_estimators', n_estimators_params),
+                 'eta': hp.choice('xgb_eta', eta_params)
+             }
+         ])}
+
+         def objective(params):
+             classifier = None
+             max_depth = params['classifier']['max_depth']
+             n_estimators = params['classifier']['n_estimators']
+             train_scores = []
+             val_scores = []
+             skfold = StratifiedKFold(n_splits=self.cv_splits)
+             for train_idx, test_idx in skfold.split(x, y):
+                 if params['classifier']['type'] == classifier_params[0]:  # rf
+                     class_weight_params = find_class_weights(y=y[train_idx])
+                     classifier = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators,
+                                                         random_state=42, class_weight=class_weight_params)
+                     classifier.fit(x.iloc[train_idx], y[train_idx])
+
+                 if params['classifier']['type'] == classifier_params[1]:  # xgboost
+                     eta = params['classifier']['eta']
+                     sample_weight_params = class_weight.compute_sample_weight(class_weight="balanced", y=y[train_idx])
+                     classifier = XGBClassifier(n_estimators=n_estimators, eta=eta,
+                                                max_depth=max_depth, random_state=42, verbosity=0)
+                     classifier.fit(x.iloc[train_idx], y[train_idx], sample_weight=sample_weight_params)
+
+                 train_scores.append(roc_auc_score(y[train_idx],
+                                                   classifier.predict_proba(x.iloc[train_idx])[:, 1]))
+                 val_scores.append(roc_auc_score(y[test_idx],
+                                                 classifier.predict_proba(x.iloc[test_idx])[:, 1]))
+
+             avg_train_score = np.mean(train_scores)
+             avg_val_score = np.mean(val_scores)
+             return {'loss': -avg_val_score, 'train_score': avg_train_score, 'val_score': avg_val_score,
+                     'status': STATUS_OK}
+
+         model_trials = Trials()
+         model_best = fmin(
+             fn=objective,
+             space=search_space,
+             algo=tpe.suggest,
+             max_evals=self.n_trials,
+             catch_eval_exceptions=False,
+             verbose=False,
+             trials=model_trials
+         )
+
+         best_classifier_name = classifier_params[model_best['classifier']]
+         best_classifier_model = None
+         if best_classifier_name == classifier_params[0]:  # rf
+             class_weight_params = find_class_weights(y=y)
+             best_max_depth = max_depth_params[model_best['rf_max_depth']]
+             best_n_estimators = n_estimators_params[model_best['rf_n_estimators']]
+             best_classifier_model = RandomForestClassifier(max_depth=best_max_depth,
+                                                            n_estimators=best_n_estimators,
+                                                            random_state=42,
+                                                            class_weight=class_weight_params)
+
+         if best_classifier_name == classifier_params[1]:  # xgboost
+             best_n_estimators = n_estimators_params[model_best['xgb_n_estimators']]
+             best_max_depth = max_depth_params[model_best['xgb_max_depth']]
+             best_eta = eta_params[model_best['xgb_eta']]
+             best_classifier_model = XGBClassifier(n_estimators=best_n_estimators, eta=best_eta,
+                                                   max_depth=best_max_depth, random_state=42, verbosity=0)
+
+         return best_classifier_model, model_trials.best_trial
+
+     def fit(self, x: pd.DataFrame, y: pd.Series):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering feature_selection.fit")
+             x = x.reset_index(drop=True).copy()
+             y = y.reset_index(drop=True).copy()
+             best_classifier, _ = self.__find_best_model(x, y)
+             if isinstance(best_classifier, XGBClassifier):
+                 sample_weight_params = class_weight.compute_sample_weight(class_weight="balanced", y=y)
+                 _ = best_classifier.fit(x, y, sample_weight=sample_weight_params)
+             else:
+                 _ = best_classifier.fit(x, y)
+             self.feature_importance = pd.Series(best_classifier.feature_importances_,
+                                                 index=x.columns).sort_values(ascending=False)
+             self.selected_features = [*self.feature_importance[: self.n_features].index]
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected feature_selection.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting feature_selection.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering feature_selection.transform")
+             x = x.reset_index(drop=True).copy()
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected feature_selection.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting feature_selection.transform")
+         return x[self.selected_features]
+
+     def get_feature_names_out(self, input_features=None):
+         return self.selected_features
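A sketch on synthetic data; note the class depends on utils.find_class_weights, which sits outside this 50-file view:

    import pandas as pd
    from sklearn.datasets import make_classification
    from feature_selection.feature_selection import FeatureSelector

    x, y = make_classification(n_samples=300, n_features=10, n_informative=3, random_state=42)
    x = pd.DataFrame(x, columns=[f"f{i}" for i in range(10)])
    selector = FeatureSelector(n_features=5, n_trials=5, cv_splits=2).fit(x, pd.Series(y))
    print(selector.selected_features)  # the 5 most important columns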
logger/__init__.py ADDED
File without changes
logger/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (158 Bytes).
logger/__pycache__/logger.cpython-37.pyc ADDED
Binary file (1.95 kB).
logger/logger.py ADDED
@@ -0,0 +1,50 @@
+ import pymongo
+ from datetime import datetime
+ from utils.set_log_secrets_env import set_log_secrets_env
+ import os
+
+
+ class MongoLogger:
+     """
+     Custom logger that inserts logs into MongoDB
+     """
+     def __init__(self):
+         set_log_secrets_env()
+         self.url = os.getenv('LOGGER_URL')
+         self.database = "logger_db"
+         self.collection = "logger"
+         self.__client = None
+         self.__error = 0
+
+     def __connect(self):
+         try:
+             self.__client = pymongo.MongoClient(self.url)
+             _ = self.__client.list_database_names()  # force a round trip to surface connection errors
+         except Exception:
+             self.__error = 1
+             self.__client = None
+             raise
+
+     def __insert(self, json_log):
+         try:
+             db = self.__client[self.database]
+             coll = db[self.collection]
+             coll.insert_one(json_log)
+         except Exception:
+             self.__error = 1
+             raise
+
+     def __close_connection(self):
+         if self.__client is not None:
+             self.__client.close()
+             self.__client = None
+
+     def log_to_db(self, level: str, message: str):
+         if self.url is not None:
+             if self.__error == 0:
+                 self.__connect()
+             if self.__error == 0:
+                 json_log = {"time": str(datetime.now()), "level": level, "message": message}
+                 self.__insert(json_log)
+             if self.__client is not None:
+                 self.__close_connection()
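A local smoke-test sketch, assuming utils.set_log_secrets_env (outside this 50-file view) leaves a pre-set LOGGER_URL in place; the connection string is illustrative:

    import os
    os.environ["LOGGER_URL"] = "mongodb://localhost:27017"  # illustrative; normally set via set_log_secrets_env
    from logger.logger import MongoLogger

    MongoLogger().log_to_db(level="INFO", message="smoke test")  # inserts into logger_db.logger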
model_inference/__init__.py ADDED
File without changes
model_inference/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (167 Bytes).
model_inference/__pycache__/model_inference.cpython-37.pyc ADDED
Binary file (2.02 kB).
model_inference/model_inference.py ADDED
@@ -0,0 +1,78 @@
+ from utils.load_model import load_model
+ from feature_construction.feature_construction import FeatureConstructor
+ import pandas as pd
+ from logger.logger import MongoLogger
+
+
+ def predict(x: pd.DataFrame, predict_proba: bool = True,
+             use_deployed_model: bool = True, model_file_name: str = None, predict_label: bool = False):
+     prediction = None
+     logger = MongoLogger()
+     try:
+         logger.log_to_db(level="INFO", message="entering model_inference.predict")
+
+         loaded_model = load_model(load_deployed_model=use_deployed_model, model_file_name=model_file_name)
+
+         if loaded_model is not None:
+             (label_encoder, cat_var_id_transform, cat_var_threshold_transform,
+              num_var_threshold_transform, rare_cat_transform, outlier_transform,
+              cat_missing_imputer, num_missing_imputer, one_hot_encoder,
+              ordinal_encoder, minmax_scaler, clusterer, multicoll_transform,
+              feature_selector, class_weight_flag, smote_transform, classifier, best_class_threshold) = loaded_model
+
+             feature_constructor = FeatureConstructor()
+             x = feature_constructor.add_features(x).copy()
+
+             if cat_var_id_transform is not None:
+                 x = cat_var_id_transform.transform(x).copy()
+
+             if cat_var_threshold_transform is not None:
+                 x = cat_var_threshold_transform.transform(x).copy()
+
+             if num_var_threshold_transform is not None:
+                 x = num_var_threshold_transform.transform(x).copy()
+
+             if rare_cat_transform is not None:
+                 x = rare_cat_transform.transform(x).copy()
+
+             if outlier_transform is not None:
+                 x = outlier_transform.transform(x).copy()
+
+             if cat_missing_imputer is not None:
+                 x = cat_missing_imputer.transform(x).copy()
+
+             if num_missing_imputer is not None:
+                 x = num_missing_imputer.transform(x).copy()
+
+             if one_hot_encoder is not None:
+                 x = one_hot_encoder.transform(x).copy()
+
+             if ordinal_encoder is not None:
+                 x = ordinal_encoder.transform(x).copy()
+
+             if minmax_scaler is not None:
+                 x = minmax_scaler.transform(x).copy()
+
+             if clusterer is not None:
+                 cluster_ohe, _ = clusterer.predict(x)
+                 x = pd.concat([x, cluster_ohe], axis=1)
+
+             if multicoll_transform is not None:
+                 x = multicoll_transform.transform(x).copy()
+
+             if feature_selector is not None:
+                 x = feature_selector.transform(x).copy()
+
+             if predict_proba:
+                 prediction = classifier.predict_proba(x)[:, 1]
+             else:
+                 prediction = (classifier.predict_proba(x)[:, 1] >= best_class_threshold).astype('int')
+                 if predict_label:
+                     prediction = label_encoder.inverse_transform(prediction)
+
+     except Exception as e:
+         logger.log_to_db(level="CRITICAL", message=f"unexpected model_inference.predict error: {e}")
+         raise
+     logger.log_to_db(level="INFO", message="exiting model_inference.predict")
+
+     return prediction
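A batch-inference sketch against the deployed model; the CSV name is illustrative, and the columns must match the prediction schema after cleaning:

    import pandas as pd
    from model_inference.model_inference import predict

    batch_df = pd.read_csv("new_applicants.csv")  # illustrative file name
    probs = predict(batch_df, predict_proba=True)  # class-1 probabilities
    labels = predict(batch_df, predict_proba=False, predict_label=True)  # decoded labels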
models/deployed_model.json ADDED
@@ -0,0 +1 @@
+ {"deployed_model": "model_cold_2022_11_22"}
models/model_cold_2022_11_22/feature_importances.csv ADDED
@@ -0,0 +1,36 @@
+ feature,importance
+ is_single,0.46307153
+ education_num,0.064920954
+ relationship_ Own-child,0.05042011
+ capital_gain,0.04799759
+ occupation_ Exec-managerial,0.036840715
+ marital_status_ Married-civ-spouse,0.029596165
+ occupation_ Prof-specialty,0.02784773
+ occupation_ Other-service,0.027295496
+ age,0.02378357
+ capital_loss,0.01889127
+ sex_ Female,0.017749619
+ relationship_rare_category,0.015518844
+ hours_per_week,0.015093696
+ occupation_ Adm-clerical,0.0121984985
+ race_ White,0.011818247
+ relationship_ Not-in-family,0.011008147
+ marital_status_ Never-married,0.010437393
+ occupation_rare_category,0.009737839
+ workclass_ Self-emp-not-inc,0.009453006
+ workclass_rare_category,0.00917275
+ country_ United-States,0.008190496
+ race_ Black,0.0076376186
+ workclass_ Private,0.007357159
+ education_group_high_school,0.0066947634
+ relationship_marital_ Own-child Never-married,0.0062995595
+ age_of_first_edu,0.00590806
+ education,0.0057953694
+ education_group_college,0.0057467762
+ race_rare_category,0.0057038493
+ fnlwgt,0.005448137
+ occupation_ Sales,0.005127732
+ education_group_rare_category,0.0049574347
+ occupation_ Craft-repair,0.0046369503
+ relationship_marital_ Husband Married-civ-spouse,0.0040548774
+ relationship_marital_rare_category,0.0035880853