Spaces:
Running
Running
Commit
•
4a58702
1
Parent(s):
3ee4302
Upload 83 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- Dockerfile +8 -0
- api.py +69 -0
- api_requirements.txt +10 -0
- clustering/__init__.py +0 -0
- clustering/__pycache__/__init__.cpython-37.pyc +0 -0
- clustering/__pycache__/clustering.cpython-37.pyc +0 -0
- clustering/clustering.py +66 -0
- data_cleaning/__init__.py +0 -0
- data_cleaning/__pycache__/__init__.cpython-37.pyc +0 -0
- data_cleaning/__pycache__/data_cleaning.cpython-37.pyc +0 -0
- data_cleaning/data_cleaning.py +54 -0
- data_preprocessing/__init__.py +0 -0
- data_preprocessing/__pycache__/__init__.cpython-37.pyc +0 -0
- data_preprocessing/__pycache__/categorical_id_var_deletor.cpython-37.pyc +0 -0
- data_preprocessing/__pycache__/categorical_imputer.cpython-37.pyc +0 -0
- data_preprocessing/__pycache__/categorical_variance_threshold.cpython-37.pyc +0 -0
- data_preprocessing/__pycache__/multicollinearity_handler.cpython-37.pyc +0 -0
- data_preprocessing/__pycache__/numeric_imputer.cpython-37.pyc +0 -0
- data_preprocessing/__pycache__/outlier_handler.cpython-37.pyc +0 -0
- data_preprocessing/__pycache__/rare_category_encoder.cpython-37.pyc +0 -0
- data_preprocessing/categorical_id_var_deletor.py +47 -0
- data_preprocessing/categorical_imputer.py +50 -0
- data_preprocessing/categorical_variance_threshold.py +48 -0
- data_preprocessing/multicollinearity_handler.py +53 -0
- data_preprocessing/numeric_imputer.py +71 -0
- data_preprocessing/outlier_handler.py +68 -0
- data_preprocessing/rare_category_encoder.py +60 -0
- data_validation/__init__.py +0 -0
- data_validation/__pycache__/__init__.cpython-37.pyc +0 -0
- data_validation/__pycache__/data_validation.cpython-37.pyc +0 -0
- data_validation/data_validation.py +62 -0
- data_validation/input_data_specs.json +71 -0
- feature_construction/__init__.py +0 -0
- feature_construction/__pycache__/__init__.cpython-37.pyc +0 -0
- feature_construction/__pycache__/feature_construction.cpython-37.pyc +0 -0
- feature_construction/feature_construction.py +70 -0
- feature_selection/__init__.py +0 -0
- feature_selection/__pycache__/__init__.cpython-37.pyc +0 -0
- feature_selection/__pycache__/feature_selection.cpython-37.pyc +0 -0
- feature_selection/feature_selection.py +143 -0
- logger/__init__.py +0 -0
- logger/__pycache__/__init__.cpython-37.pyc +0 -0
- logger/__pycache__/logger.cpython-37.pyc +0 -0
- logger/logger.py +50 -0
- model_inference/__init__.py +0 -0
- model_inference/__pycache__/__init__.cpython-37.pyc +0 -0
- model_inference/__pycache__/model_inference.cpython-37.pyc +0 -0
- model_inference/model_inference.py +78 -0
- models/deployed_model.json +1 -0
- models/model_cold_2022_11_22/feature_importances.csv +36 -0
Dockerfile
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Base image: slim Python 3.7.
FROM python:3.7-slim
# Relative paths in the instructions below resolve against /api.
WORKDIR /api
# Copy the whole build context into the image.
COPY . .
# Put the API dependency list at ./requirements.txt for the pip step below.
COPY ./docker/api_requirements.txt ./requirements.txt
# Remove the docker/ directory that came in with the context copy.
RUN rm -r ./docker
# Install the pinned dependencies without keeping pip's download cache.
RUN pip install --no-cache-dir -r requirements.txt
# Serve the `app` object from api.py with uvicorn: all interfaces, port 5001, 4 workers.
CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "5001", "--workers", "4"]
# Declares the port that the CMD above listens on.
EXPOSE 5001
|
api.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI
|
2 |
+
import uvicorn
|
3 |
+
from pydantic import BaseModel
|
4 |
+
import pandas as pd
|
5 |
+
from data_validation.data_validation import DataValidation
|
6 |
+
from data_cleaning.data_cleaning import DataCleaning
|
7 |
+
from model_inference.model_inference import predict
|
8 |
+
from logger.logger import MongoLogger
|
9 |
+
import traceback
|
10 |
+
|
11 |
+
app = FastAPI()
|
12 |
+
|
13 |
+
|
14 |
+
class Data(BaseModel):
    """
    Data dictionary for data type validation.

    Schema of the JSON body accepted by the prediction endpoint: one record
    with the fields below. The field order here is the key order of
    `data.dict()`, which the endpoint turns into DataFrame columns.
    """
    # integer-typed fields are annotated `int`, text fields `str`
    age: int
    workclass: str
    fnlwgt: int
    education: str
    education_num: int
    marital_status: str
    occupation: str
    relationship: str
    race: str
    sex: str
    capital_gain: int
    capital_loss: int
    hours_per_week: int
    country: str
|
32 |
+
|
33 |
+
|
34 |
+
@app.post("/")
def prediction(data: Data):
    """
    Processes the API request and returns a prediction.

    The request record is converted to a one-row DataFrame, validated, cleaned
    and passed to `model_inference.model_inference.predict`.

    Returns:
        {"result": <predicted label>} when validation passes, otherwise
        {"result": "data validation failed"}.

    Raises:
        Any exception raised while validating, cleaning or predicting; it is
        logged at CRITICAL level with its traceback and then re-raised.
    """
    logger = MongoLogger()
    logger.log_to_db(level="INFO", message="entering prediction_api")
    try:
        df = pd.DataFrame(data.dict(), index=[0])  # converting api data dict to df
        dv = DataValidation(input_df=df, dataset="prediction")  # validating the data
        validation_status = dv.validate_data()  # status of validation. 1=passed, 0=failed

        if validation_status != 0:
            data_cleaning = DataCleaning()
            # cleaning the data
            df = data_cleaning.clean_column_names(df).copy()
            df = data_cleaning.shorten_column_names(df).copy()
            df = data_cleaning.clean_nan(df).copy()
            # calling the 'model_inference.model_inference.predict' function
            pred = predict(df, predict_proba=False, predict_label=True)[0].strip()
        else:
            # executes when data validation fails
            pred = "data validation failed"

    except Exception:
        # log with the full traceback, then let the error propagate to the caller
        logger.log_to_db(level="CRITICAL", message=f"unexpected error in prediction_api: {traceback.format_exc()}")
        raise
    logger.log_to_db(level="INFO", message="exiting prediction_api")
    return {"result": pred}
|
66 |
+
|
67 |
+
|
68 |
+
if __name__ == '__main__':
    # uvicorn only supports workers > 1 when the application is passed as an
    # import string ("module:attribute"); given the app object it logs a
    # warning and exits. Assumes this module is importable as `api`.
    uvicorn.run(app="api:app", host='0.0.0.0', port=5001, workers=4)
|
api_requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
scikit-learn==1.0.2
|
2 |
+
xgboost==1.6.2
|
3 |
+
fastapi==0.85.2
|
4 |
+
pydantic==1.10.2
|
5 |
+
uvicorn==0.19.0
|
6 |
+
pymongo==4.3.3
|
7 |
+
pandas==1.3.5
|
8 |
+
cloudpickle==2.2.0
|
9 |
+
hyperopt==0.2.7
|
10 |
+
imblearn==0.0
|
clustering/__init__.py
ADDED
File without changes
|
clustering/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (162 Bytes). View file
|
|
clustering/__pycache__/clustering.cpython-37.pyc
ADDED
Binary file (3.03 kB). View file
|
|
clustering/clustering.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from sklearn.cluster import KMeans
|
4 |
+
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
|
5 |
+
from sklearn.metrics import silhouette_score
|
6 |
+
|
7 |
+
|
8 |
+
class AutoCluster:
    """
    Returns clusters by automatically evaluating 'k' using silhouette_score.
    'k' range = [min_cluster, max_cluster]

    The input is MinMax-scaled, one KMeans model is fitted per candidate k and
    the k with the highest silhouette score is used for the final model.
    """
    def __init__(self, min_cluster: int = 2, max_cluster: int = 10, random_state: int = 42):
        self.__scaler = None
        self.__ohe = None
        self.k = None
        self.min_cluster = min_cluster
        self.max_cluster = max_cluster
        self.kmeans_model = None
        self.random_state = random_state

    def __fit_scaler(self, x: pd.DataFrame):
        """Fits the MinMaxScaler used by every later transform call."""
        x = x.copy()
        self.__scaler = MinMaxScaler()
        self.__scaler.fit(x)

    def __find_best_k(self, x: pd.DataFrame):
        """Fits the scaler on x and stores the k with the best silhouette score in self.k."""
        x = x.copy()
        self.__fit_scaler(x)
        x_scaled = self.__scaler.transform(x)
        silhouette_scores = []
        for k in range(self.min_cluster, self.max_cluster + 1):
            kmeans = KMeans(n_clusters=k, random_state=self.random_state)
            kmeans.fit(x_scaled)
            labels = kmeans.labels_
            silhouette_scores.append(silhouette_score(X=x_scaled, labels=labels, random_state=self.random_state))
        # argmax is an offset into the candidate range; store k as a plain int
        self.k = int(self.min_cluster + np.argmax(silhouette_scores))

    def __fit_one_hot_encoder(self, x: pd.DataFrame):
        """Fits the one-hot encoder on the cluster labels of the training data."""
        self.__ohe = OneHotEncoder(sparse=False)
        self.__ohe.fit(x)

    def fit(self, x: pd.DataFrame):
        """
        Selects k, fits the final KMeans model and the label one-hot encoder.

        Returns:
            self
        """
        x = x.copy()
        self.__find_best_k(x)
        self.kmeans_model = KMeans(n_clusters=self.k, random_state=self.random_state)
        x_scaled = self.__scaler.transform(x)
        self.kmeans_model.fit(x_scaled)
        prediction_df = pd.DataFrame({'cluster': self.kmeans_model.predict(x_scaled)})
        self.__fit_one_hot_encoder(prediction_df)
        return self

    def predict(self, x: pd.DataFrame):
        """
        Assigns clusters to x with the fitted model.

        Returns:
            (prediction_ohe, prediction): the one-hot encoded cluster labels as
            a DataFrame, and the raw cluster labels.
        """
        x = x.copy()
        x_scaled = self.__scaler.transform(x)
        prediction = self.kmeans_model.predict(x_scaled)
        prediction_df = pd.DataFrame({'cluster': prediction})
        prediction_ohe = pd.DataFrame(self.__ohe.transform(prediction_df), columns=self.__ohe.get_feature_names_out())
        return prediction_ohe, prediction

    def fit_predict(self, x: pd.DataFrame):
        """
        Fits on x and returns the prediction for x (same tuple as predict).
        """
        # the prediction used to be computed and dropped, so callers got None
        return self.fit(x).predict(x)

    def __repr__(self):
        # report the configured random_state instead of a hard-coded 42
        return (f"AutoCluster(min_cluster={self.min_cluster}, max_cluster={self.max_cluster}, "
                f"random_state={self.random_state})")
|
data_cleaning/__init__.py
ADDED
File without changes
|
data_cleaning/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (165 Bytes). View file
|
|
data_cleaning/__pycache__/data_cleaning.cpython-37.pyc
ADDED
Binary file (2.88 kB). View file
|
|
data_cleaning/data_cleaning.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import re
|
3 |
+
import numpy as np
|
4 |
+
from logger.logger import MongoLogger
|
5 |
+
|
6 |
+
|
7 |
+
class DataCleaning:
    """
    Column-name and missing-value clean-up helpers.

    Every method works on a copy of the input frame, logs entry and exit at
    INFO level, and logs any exception at CRITICAL level before re-raising it.
    """
    def __init__(self):
        self.logger = MongoLogger()

    def clean_column_names(self, input_df: pd.DataFrame):
        """
        Replaces '.', '?' and whitespace in column names with underscore. Also
        converts column names into lowercase and strips surrounding whitespace.
        """
        self.logger.log_to_db(level="INFO", message="entering data_cleaning.clean_column_names")
        try:
            df = input_df.copy()
            clean_col_names = [re.sub(r"[\.\?\s]", "_", col_name.lower().strip()) for col_name in df.columns]
            df.columns = clean_col_names
        except Exception as e:
            self.logger.log_to_db(level="CRITICAL", message=f"unexpected data_cleaning.clean_column_names error: {e}")
            raise
        self.logger.log_to_db(level="INFO", message="exiting data_cleaning.clean_column_names")
        return df

    def shorten_column_names(self, input_df: pd.DataFrame, max_len: int = 25):
        """
        Shortens the column names to at most max_len characters
        """
        self.logger.log_to_db(level="INFO", message="entering data_cleaning.shorten_column_names")
        try:
            df = input_df.copy()
            short_col_names = [col_name[:max_len] for col_name in df.columns]
            df.columns = short_col_names
        except Exception as e:
            self.logger.log_to_db(level="CRITICAL", message=f"unexpected data_cleaning.shorten_column_names error: {e}")
            raise
        self.logger.log_to_db(level="INFO", message="exiting data_cleaning.shorten_column_names")
        return df

    def clean_nan(self, input_df: pd.DataFrame, to_replace: list = None):
        """
        Replaces placeholder values in data with NaN.

        to_replace: values treated as missing (exact match, no regex). When
        omitted, defaults to [' ?', '?', '-', '_', -1, "-1"].
        """
        if to_replace is None:
            # built per call so no list object is shared between calls
            to_replace = [' ?', '?', '-', '_', -1, "-1"]
        self.logger.log_to_db(level="INFO", message="entering data_cleaning.clean_nan")
        try:
            df = input_df.copy()
            df.replace(to_replace=to_replace, value=np.nan, inplace=True, regex=False)
        except Exception as e:
            self.logger.log_to_db(level="CRITICAL", message=f"unexpected data_cleaning.clean_nan error: {e}")
            raise
        self.logger.log_to_db(level="INFO", message="exiting data_cleaning.clean_nan")
        return df
|
data_preprocessing/__init__.py
ADDED
File without changes
|
data_preprocessing/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (170 Bytes). View file
|
|
data_preprocessing/__pycache__/categorical_id_var_deletor.cpython-37.pyc
ADDED
Binary file (2.52 kB). View file
|
|
data_preprocessing/__pycache__/categorical_imputer.cpython-37.pyc
ADDED
Binary file (2.7 kB). View file
|
|
data_preprocessing/__pycache__/categorical_variance_threshold.cpython-37.pyc
ADDED
Binary file (2.71 kB). View file
|
|
data_preprocessing/__pycache__/multicollinearity_handler.cpython-37.pyc
ADDED
Binary file (2.68 kB). View file
|
|
data_preprocessing/__pycache__/numeric_imputer.cpython-37.pyc
ADDED
Binary file (3.17 kB). View file
|
|
data_preprocessing/__pycache__/outlier_handler.cpython-37.pyc
ADDED
Binary file (2.77 kB). View file
|
|
data_preprocessing/__pycache__/rare_category_encoder.cpython-37.pyc
ADDED
Binary file (2.92 kB). View file
|
|
data_preprocessing/categorical_id_var_deletor.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
3 |
+
from logger.logger import MongoLogger
|
4 |
+
|
5 |
+
|
6 |
+
class CategoricalIdVarDeletor(BaseEstimator, TransformerMixin):
    """
    removes columns with % of unique cats >= threshold.
    if threshold=1 i.e. 100% categories are unique. so it's removed
    Doesn't Remove cols with all NAs
    This considers NA as separate category
    """
    def __init__(self, threshold: float = 1):
        self.threshold = threshold
        self.selected_features = []

    @staticmethod
    def get_unique_cat_percent(x: pd.Series, n_rows: int):
        """Returns (number of distinct values in x, NA counted as one value) / n_rows."""
        x = x.copy()
        return len(x.fillna("!@#$%This value is missing^&*(").unique()) / n_rows

    def fit(self, x: pd.DataFrame, y=None):
        """
        Stores the columns whose unique-category share is below the threshold.

        y is accepted and ignored so that fit_transform(X, y) and supervised
        pipelines, which call fit(X, y), work with this transformer.
        """
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering categorical_id_var_deletor.fit")
            x = x.copy()
            unique_cat_percent = x.apply(self.get_unique_cat_percent, n_rows=x.shape[0])
            self.selected_features = [*unique_cat_percent[unique_cat_percent < self.threshold].index]
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_id_var_deletor.fit error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting categorical_id_var_deletor.fit")
        return self

    def transform(self, x: pd.DataFrame, y=None):
        """Returns a copy of x restricted to the columns selected during fit."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering categorical_id_var_deletor.transform")
            # select inside the try so a missing column is logged before it propagates
            selected = x[self.selected_features].copy()
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_id_var_deletor.transform error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting categorical_id_var_deletor.transform")
        return selected

    def get_feature_names_out(self, input_features=None):
        """Returns the names of the columns kept by transform."""
        return self.selected_features
|
data_preprocessing/categorical_imputer.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
3 |
+
from logger.logger import MongoLogger
|
4 |
+
|
5 |
+
|
6 |
+
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """
    Fills missing values in categorical columns.

    strategy:
        - "constant": each column is filled with the text "<column name>_missing".
        - anything else (default "most_frequent"): each column is filled with
          its most frequent value seen during fit.

    Fill values are stored per column position, in the column order seen by fit.
    """
    def __init__(self, strategy="most_frequent"):
        self.strategy = strategy
        self.fill_values = None
        self.feature_names = []

    def find_fill_values(self, x):
        """Computes one fill value per column of x and stores the list in self.fill_values."""
        if self.strategy != 'constant':
            def top_category(column):
                # value_counts sorts by frequency, so the first index entry is the mode
                return [*column.value_counts().index][0]

            self.fill_values = [*x.apply(top_category)]
            return
        self.fill_values = [f'{column}_missing' for column in [*x.columns]]

    def fit(self, x: pd.DataFrame, y=None):
        """Remembers the column names of x and learns the fill values. Returns self."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering categorical_imputer.fit")
            self.feature_names = [*x.columns]
            self.find_fill_values(x)
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_imputer.fit error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting categorical_imputer.fit")
        return self

    def transform(self, x: pd.DataFrame, y=None):
        """Returns a copy of x with NaNs in the i-th column replaced by the i-th fill value."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering categorical_imputer.transform")
            x = x.copy()
            position = 0
            for column in [*x.columns]:
                filler = self.fill_values[position]
                x[column] = x[column].fillna(filler)
                position += 1
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_imputer.transform error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting categorical_imputer.transform")
        return x

    def get_feature_names_out(self, input_features=None):
        """Returns the column names seen during fit."""
        return self.feature_names
|
data_preprocessing/categorical_variance_threshold.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
3 |
+
from logger.logger import MongoLogger
|
4 |
+
|
5 |
+
|
6 |
+
class CategoricalVarianceThreshold(BaseEstimator, TransformerMixin):
    """
    Drops near-constant categorical columns.

    A column is kept only when the share of its most common value is below the
    threshold; NA is counted as a category of its own, so a column that is
    entirely NA has a top share of 1.0.
    """

    def __init__(self, threshold: float = 0.99):
        self.threshold = threshold
        self.selected_features = []

    @staticmethod
    def __get_max_category_proportions(x: pd.DataFrame):
        """Returns, per column, the relative frequency of its most common value."""
        def top_share(col):
            filled = col.fillna("!@#$%This value is missing^&*(")
            shares = filled.value_counts(normalize=True)
            return shares.values[0]

        frame = x.copy()
        return frame.apply(top_share)

    def fit(self, x, y=None):
        """Stores the columns whose top-category share is below the threshold. Returns self."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering categorical_variance_threshold.fit")
            frame = x.copy()
            top_shares = self.__get_max_category_proportions(frame)
            below_threshold = top_shares < self.threshold
            self.selected_features = list(top_shares[below_threshold].index)
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_variance_threshold.fit error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting categorical_variance_threshold.fit")
        return self

    def transform(self, x, y=None):
        """Returns x restricted to the columns selected during fit."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering categorical_variance_threshold.transform")
            frame = x.copy()
        except Exception as e:
            logger.log_to_db(level="CRITICAL",
                             message=f"unexpected categorical_variance_threshold.transform error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting categorical_variance_threshold.transform")
        kept_columns = self.selected_features
        return frame[kept_columns]

    def get_feature_names_out(self, input_features=None):
        """Returns the names of the columns kept by transform."""
        return self.selected_features
|
data_preprocessing/multicollinearity_handler.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
4 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
5 |
+
import warnings
|
6 |
+
from logger.logger import MongoLogger
|
7 |
+
warnings.filterwarnings("ignore")
|
8 |
+
|
9 |
+
|
10 |
+
class MulticollinearityHandler(BaseEstimator, TransformerMixin):
    """
    Removes numeric variables having VIF > threshold

    Columns are dropped one at a time, highest VIF first, and the VIFs are
    recomputed after each drop until none exceeds the threshold.
    """
    def __init__(self, threshold: float = 10):
        self.threshold = threshold
        self.selected_features = []

    @staticmethod
    def get_vif(x: pd.DataFrame):
        """Returns the variance inflation factor of every column of x, in column order."""
        x = x.copy()
        vif = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]  # calculate VIF
        return vif

    def fit(self, x: pd.DataFrame, y=None):
        """Finds and stores the columns that remain after the VIF-based elimination. Returns self."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering multicollinearity_handler.fit")
            x = x.copy()
            vif = self.get_vif(x)
            # stop when no column is left: np.max raises on an empty list
            while len(vif) > 0 and np.max(vif) > self.threshold:
                max_vif_column = x.columns[np.argmax(vif)]  # column with max vif
                x = x.drop(columns=[max_vif_column]).copy()  # drop column with max vif one at a time
                vif = self.get_vif(x)
            self.selected_features = [*x.columns]  # storing remainder columns i.e. cols without multicollinearity
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected multicollinearity_handler.fit error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting multicollinearity_handler.fit")
        return self

    def transform(self, x: pd.DataFrame, y=None):
        """Returns a copy of x restricted to the columns selected during fit."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering multicollinearity_handle.transform")
            # select inside the try so a missing column is logged before it propagates
            selected = x[self.selected_features].copy()
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected multicollinearity_handle.transform error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting multicollinearity_handle.transform")
        return selected

    def get_feature_names_out(self, input_features=None):
        """Returns the names of the columns kept by transform."""
        return self.selected_features
|
data_preprocessing/numeric_imputer.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
4 |
+
from logger.logger import MongoLogger
|
5 |
+
|
6 |
+
|
7 |
+
class NumericImputer(BaseEstimator, TransformerMixin):
    """
    Imputes missing values in numeric features.
    strategy:
        - "mean": imputes mean.
        - "median": imputes median.
        - "lower_bound": imputes lower bound of IQR method for outlier detection.
        - any other value (e.g. "upper_bound"): imputes upper bound of IQR method
          for outlier detection.
    Using the bound factor, lower/upper bounds can be controlled. A higher bound_factor will result in
    end-of-distribution imputation. It is equivalent to adding missing category in categorical imputation.

    Fill values are stored per column position, in the column order seen by fit.
    """
    def __init__(self, strategy: str = "median", bound_factor: float = 1.5):
        self.strategy = strategy
        self.bound_factor = bound_factor
        self.fill_values = []
        self.feature_names = []

    def find_bounds(self, x: pd.Series):
        """
        Find lower/upper bounds using IQR method for outlier detection.
        Returns q1 - bound_factor * iqr for "lower_bound", otherwise q3 + bound_factor * iqr.
        """
        q1 = x.quantile(0.25)
        q3 = x.quantile(0.75)
        iqr = q3 - q1
        if self.strategy == "lower_bound":
            bound = q1 - (self.bound_factor * iqr)
        else:
            bound = q3 + (self.bound_factor * iqr)
        return bound

    def find_fill_values(self, x: pd.Series):
        """Computes the fill value for one column (NaNs ignored) and appends it to self.fill_values."""
        if self.strategy == 'mean':
            self.fill_values.append(np.mean(x.dropna()))
        elif self.strategy == 'median':
            self.fill_values.append(np.median(x.dropna()))
        else:
            self.fill_values.append(self.find_bounds(x.dropna()))

    def fit(self, x: pd.DataFrame, y=None):
        """Learns one fill value per column of x. Returns self."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering numeric_imputer.fit")
            self.feature_names = [*x.columns]
            # start from a clean list: otherwise a second fit keeps the values of
            # the first one at the front and transform would use those
            self.fill_values = []
            for _, column in x.items():
                self.find_fill_values(column)
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected numeric_imputer.fit error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting numeric_imputer.fit")
        return self

    def transform(self, x: pd.DataFrame, y=None):
        """Returns a copy of x with NaNs in the i-th column replaced by the i-th fill value."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering numeric_imputer.transform")
            x = x.copy()
            for i, column in enumerate([*x.columns]):
                x[column] = x[column].fillna(self.fill_values[i])
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected numeric_imputer.transform error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting numeric_imputer.transform")
        return x

    def get_feature_names_out(self, input_features=None):
        """Returns the column names seen during fit."""
        return self.feature_names
|
data_preprocessing/outlier_handler.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
4 |
+
from logger.logger import MongoLogger
|
5 |
+
|
6 |
+
|
7 |
+
class OutlierHandler(BaseEstimator, TransformerMixin):
    """
    This transform detects outliers using IQR method. Then the outliers
    are either replaced with NaN or with lower or upper bounds computed
    using IQR method i.e. they are winsorized.

    method: 'winsorize' clips to the bounds; any other value replaces outliers with NaN.
    factor: multiplier of the IQR used to place the bounds.
    Bounds are stored per column position, in the column order seen by fit.
    """
    def __init__(self, method: str = 'winsorize', factor: float = 1.5):
        self.method = method
        self.factor = factor
        self.lower_bounds = []
        self.upper_bounds = []
        self.feature_names = []

    def detect_bounds(self, x: pd.Series):
        """
        Method to detect the lower and upper bounds using IQR method.
        Appends the bounds of this column to self.lower_bounds / self.upper_bounds.
        """
        x = x.copy()
        q1 = x.quantile(0.25)
        q3 = x.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (self.factor * iqr)
        upper_bound = q3 + (self.factor * iqr)
        self.lower_bounds.append(lower_bound)
        self.upper_bounds.append(upper_bound)

    def fit(self, x: pd.DataFrame, y=None):
        """Learns the lower and upper bound of every column of x. Returns self."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering outlier_handler.fit")
            x = x.copy()
            self.feature_names = [*x.columns]
            # start from clean lists: otherwise a second fit keeps the bounds of
            # the first one at the front and transform would use those
            self.lower_bounds = []
            self.upper_bounds = []
            for _, column in x.items():
                self.detect_bounds(column)
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected outlier_handler.fit error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting outlier_handler.fit")
        return self

    def transform(self, x: pd.DataFrame, y=None):
        """Returns a copy of x with values outside the fitted bounds clipped or set to NaN."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering outlier_handler.transform")
            x = x.copy()
            for i, column in enumerate(x.columns):
                lower_bound = self.lower_bounds[i]
                upper_bound = self.upper_bounds[i]
                if self.method == 'winsorize':
                    lower_repl, upper_repl = lower_bound, upper_bound
                else:
                    lower_repl = upper_repl = np.nan
                x.loc[(x[column] < lower_bound), column] = lower_repl
                x.loc[(x[column] > upper_bound), column] = upper_repl
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected outlier_handler.transform error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting outlier_handler.transform")
        return x

    def get_feature_names_out(self, input_features=None):
        """Returns the column names seen during fit."""
        return self.feature_names
|
data_preprocessing/rare_category_encoder.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
3 |
+
from logger.logger import MongoLogger
|
4 |
+
|
5 |
+
|
6 |
+
class RareCategoryEncoder(BaseEstimator, TransformerMixin):
    """
    replaces rare categories with rare_category.
    Any unknown category in test set is also replaced with rare_category.
    stores the frequent categories.
    NAs are ignored (as a col with single value 'a' and remaining NAs). 'a' can't be considered rare
    as it is the only value available, it can't be rare.
    This method doesn't touch NAs. They are left as they are.

    Frequent categories are stored per column position, in the column order seen by fit.
    """
    def __init__(self, threshold: float = 0.05, replace_value: str = 'rare_category'):
        self.threshold = threshold
        self.replace_value = replace_value
        self.feature_names = []
        self.frequent_cat_list = []

    def __frequent_category_detector(self, x: pd.Series, y=None):
        """Appends the list of categories of x whose relative frequency exceeds the threshold."""
        x = x.copy()
        val_counts = x.value_counts(normalize=True)
        # frequent categories in a column are the ones whose frequency > threshold
        frequent_cats = [*val_counts[val_counts > self.threshold].index]
        self.frequent_cat_list.append(frequent_cats)

    def fit(self, x: pd.DataFrame, y=None):
        """Learns the frequent categories of every column of x. Returns self."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering rare_category_encoder.fit")
            x = x.copy()
            self.feature_names = [*x.columns]
            # start from a clean list: otherwise a second fit keeps the categories
            # of the first one at the front and transform would use those
            self.frequent_cat_list = []
            for _, column in x.items():
                self.__frequent_category_detector(column)
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected rare_category_encoder.fit error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting rare_category_encoder.fit")
        return self

    def transform(self, x: pd.DataFrame, y=None):
        """Returns a copy of x where every non-NA value outside the frequent list is replace_value."""
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering rare_category_encoder.transform")
            x = x.copy()
            for i in range(x.shape[1]):
                x_ser = x.iloc[:, i].copy()
                # replacing categories in each column, not in frequent list and not NANs with replace_value
                rare_mask = ~x_ser.isin(self.frequent_cat_list[i]) & x_ser.notna()
                x_ser[rare_mask] = self.replace_value
                x.iloc[:, i] = x_ser
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected rare_category_encoder.transform error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting rare_category_encoder.transform")
        return x

    def get_feature_names_out(self, input_features=None):
        """Returns the column names seen during fit."""
        return self.feature_names
|
data_validation/__init__.py
ADDED
File without changes
|
data_validation/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (167 Bytes). View file
|
|
data_validation/__pycache__/data_validation.cpython-37.pyc
ADDED
Binary file (2.27 kB). View file
|
|
data_validation/data_validation.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils.json_parser import JSONParser
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
from logger.logger import MongoLogger
|
5 |
+
|
6 |
+
|
7 |
+
class DataValidation:
    """
    - Class to validate input data as per the data dictionary. It validates the column count,
    column data types and column names.
    - Returns 1 if data is valid else 0
    - dataset: str
        - train: for train set
        - test: for test set
        - prediction: for single sample inference / batch inference
    """
    def __init__(self, input_df: pd.DataFrame, dataset: str = "train"):
        self.input_df = input_df
        self.dataset = dataset

    def validate_data(self):
        """
        Compare ``self.input_df`` against the specs in ``input_data_specs.json``.

        The column count, the ordered column names and the ordered column
        dtypes are each checked; every failed check is logged as CRITICAL.
        Returns 1 when all checks pass, otherwise 0. Unexpected errors are
        logged and re-raised.
        """
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="Entering data_validation")
            status = 1
            # parsing input data specification JSON
            # the spec file lives next to this module; resolving it from __file__
            # keeps validation working regardless of the current working directory
            specs_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'input_data_specs.json')
            json_parser = JSONParser(specs_path)
            input_data_specs_dict = json_parser.parse_json()
            # column specs for train and test data
            column_count_key = 'train_test_column_count'
            column_name_key = 'train_test_column_names'
            column_types_key = 'train_test_column_dtypes'
            # column specs for prediction data. Prediction data shouldn't have target (salary) column
            if self.dataset == "prediction":
                column_count_key = 'prediction_column_count'
                column_name_key = 'prediction_column_names'
                column_types_key = 'prediction_column_dtypes'

            n_cols = input_data_specs_dict[column_count_key]
            col_names = input_data_specs_dict[column_name_key]
            col_dtypes = input_data_specs_dict[column_types_key]

            if len(self.input_df.columns) != n_cols:
                status = 0
                logger.log_to_db(level="CRITICAL",
                                 message=f"{self.dataset} data_validation failed: column count doesn't match")

            if col_names != [*self.input_df.columns]:
                status = 0
                logger.log_to_db(level="CRITICAL",
                                 message=f"{self.dataset} data_validation failed: column names don't match")

            # the spec stores dtypes as strings ("int64", "O"); the comparison relies on
            # dtype objects comparing equal to their string alias
            if col_dtypes != self.input_df.dtypes.tolist():
                status = 0
                logger.log_to_db(level="CRITICAL",
                                 message=f"{self.dataset} data_validation failed: column dtypes don't match")
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected data_validation error: {e}")
            raise

        logger.log_to_db(level="INFO", message="exiting data_validation")
        return status
|
data_validation/input_data_specs.json
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train_test_column_count": 15,
|
3 |
+
"train_test_column_names": [
|
4 |
+
"age",
|
5 |
+
"workclass",
|
6 |
+
"fnlwgt",
|
7 |
+
"education",
|
8 |
+
"education_num",
|
9 |
+
"marital_status",
|
10 |
+
"occupation",
|
11 |
+
"relationship",
|
12 |
+
"race",
|
13 |
+
"sex",
|
14 |
+
"capital_gain",
|
15 |
+
"capital_loss",
|
16 |
+
"hours_per_week",
|
17 |
+
"country",
|
18 |
+
"salary"
|
19 |
+
|
20 |
+
],
|
21 |
+
"train_test_column_dtypes": [
|
22 |
+
"int64",
|
23 |
+
"O",
|
24 |
+
"int64",
|
25 |
+
"O",
|
26 |
+
"int64",
|
27 |
+
"O",
|
28 |
+
"O",
|
29 |
+
"O",
|
30 |
+
"O",
|
31 |
+
"O",
|
32 |
+
"int64",
|
33 |
+
"int64",
|
34 |
+
"int64",
|
35 |
+
"O",
|
36 |
+
"O"
|
37 |
+
],
|
38 |
+
"prediction_column_count": 14,
|
39 |
+
"prediction_column_names": [
|
40 |
+
"age",
|
41 |
+
"workclass",
|
42 |
+
"fnlwgt",
|
43 |
+
"education",
|
44 |
+
"education_num",
|
45 |
+
"marital_status",
|
46 |
+
"occupation",
|
47 |
+
"relationship",
|
48 |
+
"race",
|
49 |
+
"sex",
|
50 |
+
"capital_gain",
|
51 |
+
"capital_loss",
|
52 |
+
"hours_per_week",
|
53 |
+
"country"
|
54 |
+
],
|
55 |
+
"prediction_column_dtypes": [
|
56 |
+
"int64",
|
57 |
+
"O",
|
58 |
+
"int64",
|
59 |
+
"O",
|
60 |
+
"int64",
|
61 |
+
"O",
|
62 |
+
"O",
|
63 |
+
"O",
|
64 |
+
"O",
|
65 |
+
"O",
|
66 |
+
"int64",
|
67 |
+
"int64",
|
68 |
+
"int64",
|
69 |
+
"O"
|
70 |
+
]
|
71 |
+
}
|
feature_construction/__init__.py
ADDED
File without changes
|
feature_construction/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (172 Bytes). View file
|
|
feature_construction/__pycache__/feature_construction.cpython-37.pyc
ADDED
Binary file (1.71 kB). View file
|
|
feature_construction/feature_construction.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from logger.logger import MongoLogger
|
3 |
+
|
4 |
+
|
5 |
+
class FeatureConstructor:
    """
    Adds new engineered features to input data
    """
    @staticmethod
    def add_features(x: pd.DataFrame):
        """
        Return a copy of ``x`` extended with the engineered columns
        ``education_group``, ``is_single``, ``relationship_marital``,
        ``has_capital_gain`` and ``age_of_first_edu`` (added in that order).

        Values of ``education`` / ``marital_status`` that are not listed in
        the lookups below end up as NaN in the derived column.
        Unexpected errors are logged and re-raised.
        """
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering add_features")
            engineered = x.copy()

            # education levels listed per target group, then inverted into a value -> group lookup
            # (keys keep the leading space present in the raw data)
            # NOTE(review): ' Prof-school' is grouped under 'high_school' - confirm this is intended;
            # changing it would alter the features seen by already trained models
            levels_by_group = {
                'primary_school': (' Preschool', ' 1st-4th'),
                'middle_school': (' 5th-6th', ' 7th-8th', ' 9th'),
                'high_school': (' 10th', ' 11th', ' 12th', ' Prof-school'),
                'hs_grad': (' HS-grad',),
                'college': (' Some-college', ' Assoc-voc', ' Assoc-acdm', ' Bachelors', ' Masters'),
                'doctorate': (' Doctorate',),
            }
            education_lookup = {level: group
                                for group, levels in levels_by_group.items()
                                for level in levels}
            engineered['education_group'] = engineered['education'].map(education_lookup)

            # workclass group has very low mutual info score so it is not constructed

            # 1 -> currently without a spouse present, 0 -> married with spouse
            single_lookup = dict.fromkeys((' Divorced',
                                           ' Married-spouse-absent',
                                           ' Never-married',
                                           ' Separated',
                                           ' Widowed'), 1)
            single_lookup.update(dict.fromkeys((' Married-AF-spouse',
                                                ' Married-civ-spouse'), 0))
            engineered['is_single'] = engineered['marital_status'].map(single_lookup)

            # plain string concatenation of the two categorical columns
            engineered['relationship_marital'] = engineered['relationship'] + engineered['marital_status']

            # capital_gain itself is left uncapped; only a binary flag is derived from it
            # (no equivalent flag is built for capital_loss)
            gain_is_positive = engineered['capital_gain'] > 0
            engineered['has_capital_gain'] = gain_is_positive.astype("int")

            engineered['age_of_first_edu'] = engineered['age'] - engineered['education_num']

        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected add_features error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting add_features")

        return engineered
|
feature_selection/__init__.py
ADDED
File without changes
|
feature_selection/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (169 Bytes). View file
|
|
feature_selection/__pycache__/feature_selection.cpython-37.pyc
ADDED
Binary file (5.42 kB). View file
|
|
feature_selection/feature_selection.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
|
4 |
+
from sklearn.ensemble import RandomForestClassifier
|
5 |
+
from xgboost import XGBClassifier
|
6 |
+
from sklearn.metrics import roc_auc_score
|
7 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
8 |
+
from sklearn.model_selection import StratifiedKFold
|
9 |
+
from utils.find_class_weights import find_class_weights
|
10 |
+
from sklearn.utils import class_weight
|
11 |
+
from logger.logger import MongoLogger
|
12 |
+
|
13 |
+
|
14 |
+
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    - Feature selector selects n_features by training RandomForestClassifier and XGBClassifier.
    - It uses hyperopt hyperparameter tuning to choose the best model from the two.
    - It uses feature importance of the best model to select features.

    Parameters:
    - n_features: number of top features (by importance) to keep
    - n_trials: number of hyperopt evaluations
    - cv_splits: number of stratified CV folds used to score each trial
    """
    def __init__(self, n_features: int, n_trials: int = 10, cv_splits: int = 2):
        self.n_features = int(n_features)
        self.n_trials = n_trials
        self.cv_splits = cv_splits
        self.selected_features = []
        self.feature_importance = None

    def __find_best_model(self, x: pd.DataFrame, y: pd.Series):
        """
        Run the hyperopt search and return ``(unfitted best classifier, best trial)``.

        Each trial is scored by the mean validation ROC AUC over the stratified folds;
        hyperopt minimises, so the negated score is used as the loss.
        """
        x = x.reset_index(drop=True).copy()
        y = y.reset_index(drop=True).copy()
        classifier_params = ['random_forest', 'xgboost']
        max_depth_params = [3, 4, 5, 6, 7, 8, 9, 10]
        n_estimators_params = [50, 100, 200, 300, 500]
        eta_params = [0.1, 0.3, 0.01, 0.001, 0.0001, 1]
        search_space = {'classifier': hp.choice('classifier', [
            {
                'type': classifier_params[0],
                'max_depth': hp.choice('rf_max_depth', max_depth_params),
                'n_estimators': hp.choice('rf_n_estimators', n_estimators_params)
            },
            {
                'type': classifier_params[1],
                'max_depth': hp.choice('xgb_max_depth', max_depth_params),
                'n_estimators': hp.choice('xgb_n_estimators', n_estimators_params),
                'eta': hp.choice('xgb_eta', eta_params)
            }
        ])}

        def objective(params):
            classifier = None
            classifier_type = params['classifier']['type']
            max_depth = params['classifier']['max_depth']
            n_estimators = params['classifier']['n_estimators']
            train_scores = []
            val_scores = []
            skfold = StratifiedKFold(n_splits=self.cv_splits)
            for train_idx, test_idx in skfold.split(x, y):
                # x and y carry a fresh 0..n-1 index, so positional selection
                # matches the fold indices
                x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
                x_val, y_val = x.iloc[test_idx], y.iloc[test_idx]

                if classifier_type == classifier_params[0]:  # rf
                    class_weight_params = find_class_weights(y=y_train)
                    classifier = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators,
                                                        random_state=42,
                                                        class_weight=class_weight_params)
                    classifier.fit(x_train, y_train)

                if classifier_type == classifier_params[1]:  # xgboost
                    eta = params['classifier']['eta']
                    sample_weight_params = class_weight.compute_sample_weight(class_weight="balanced", y=y_train)
                    classifier = XGBClassifier(n_estimators=n_estimators, eta=eta,
                                               max_depth=max_depth, random_state=42, verbosity=0)
                    classifier.fit(x_train, y_train, sample_weight=sample_weight_params)

                train_scores.append(roc_auc_score(y_train, classifier.predict_proba(x_train)[:, 1]))
                val_scores.append(roc_auc_score(y_val, classifier.predict_proba(x_val)[:, 1]))

            avg_train_score = np.mean(train_scores)
            avg_val_score = np.mean(val_scores)
            return {'loss': -avg_val_score, 'train_score': avg_train_score, 'val_score': avg_val_score,
                    'status': STATUS_OK}

        model_trials = Trials()
        model_best = fmin(
            fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=self.n_trials,
            catch_eval_exceptions=False,
            verbose=False,
            trials=model_trials
        )

        # for hp.choice, fmin reports the index of the chosen option, not the option itself,
        # so every best value is looked up in its candidate list
        best_classifier_name = classifier_params[model_best['classifier']]
        best_classifier_model = None
        if best_classifier_name == classifier_params[0]:  # rf
            class_weight_params = find_class_weights(y=y)
            best_max_depth = max_depth_params[model_best['rf_max_depth']]
            best_n_estimators = n_estimators_params[model_best['rf_n_estimators']]
            best_classifier_model = RandomForestClassifier(max_depth=best_max_depth,
                                                           n_estimators=best_n_estimators,
                                                           random_state=42,
                                                           class_weight=class_weight_params)

        if best_classifier_name == classifier_params[1]:  # xgboost
            best_n_estimators = n_estimators_params[model_best['xgb_n_estimators']]
            best_max_depth = max_depth_params[model_best['xgb_max_depth']]
            best_eta = eta_params[model_best['xgb_eta']]
            best_classifier_model = XGBClassifier(n_estimators=best_n_estimators, eta=best_eta,
                                                  max_depth=best_max_depth, random_state=42, verbosity=0)

        return best_classifier_model, model_trials.best_trial

    def fit(self, x: pd.DataFrame, y: pd.Series):
        """
        Find the best model, fit it on the full data and keep the ``n_features``
        columns with the highest feature importance. Returns ``self``.
        """
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering feature_selection.fit")
            x = x.reset_index(drop=True).copy()
            y = y.reset_index(drop=True).copy()
            best_classifier, _ = self.__find_best_model(x, y)
            if isinstance(best_classifier, XGBClassifier):
                # the random forest handles imbalance through class_weight set at construction;
                # xgboost gets balanced per-sample weights at fit time instead
                sample_weight_params = class_weight.compute_sample_weight(class_weight="balanced", y=y)
                _ = best_classifier.fit(x, y, sample_weight=sample_weight_params)
            else:
                _ = best_classifier.fit(x, y)
            self.feature_importance = pd.Series(best_classifier.feature_importances_,
                                                index=x.columns).sort_values(ascending=False)
            self.selected_features = [*self.feature_importance[: self.n_features].index]
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected feature_selection.fit error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting feature_selection.fit")
        return self

    def transform(self, x: pd.DataFrame, y=None):
        """
        Return ``x`` (with a fresh 0..n-1 index) restricted to the selected features.

        Raises KeyError when a selected feature is missing from ``x``; the error
        is logged before being re-raised.
        """
        logger = MongoLogger()
        try:
            logger.log_to_db(level="INFO", message="entering feature_selection.transform")
            # the selection is done inside the try so that a missing column is logged
            # like every other failure instead of bypassing the logger
            x_selected = x.reset_index(drop=True)[self.selected_features].copy()
        except Exception as e:
            logger.log_to_db(level="CRITICAL", message=f"unexpected feature_selection.transform error: {e}")
            raise
        logger.log_to_db(level="INFO", message="exiting feature_selection.transform")
        return x_selected

    def get_feature_names_out(self, input_features=None):
        """Return the names of the features selected during ``fit``."""
        return self.selected_features
|
logger/__init__.py
ADDED
File without changes
|
logger/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (158 Bytes). View file
|
|
logger/__pycache__/logger.cpython-37.pyc
ADDED
Binary file (1.95 kB). View file
|
|
logger/logger.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pymongo
|
2 |
+
from datetime import datetime
|
3 |
+
from utils.set_log_secrets_env import set_log_secrets_env
|
4 |
+
import os
|
5 |
+
|
6 |
+
|
7 |
+
class MongoLogger:
    """
    Custom logger that inserts logs into MongoDB

    - The connection URL is read from the ``LOGGER_URL`` environment variable;
      when it is not set, ``log_to_db`` does nothing.
    - A connection is opened and closed for every log call.
    - After the first connection / insert failure the instance stops trying to
      log (the failure itself is still raised to the caller once).
    """
    def __init__(self):
        set_log_secrets_env()
        self.url = os.getenv('LOGGER_URL')
        self.database = "logger_db"
        self.collection = "logger"
        self.__client = None
        self.__error = 0

    def __connect(self):
        """Open the client and verify the server is reachable; re-raises on failure."""
        try:
            self.__client = pymongo.MongoClient(self.url)
            # presumably issued to force a round trip to the server so that connection
            # problems surface here rather than on the first insert
            _ = self.__client.list_database_names()
        except Exception:
            self.__error = 1
            # release a client that was created but turned out to be unusable
            self.__close_connection()
            raise

    def __insert(self, json_log):
        """Insert one log document into ``database.collection``; re-raises on failure."""
        try:
            db = self.__client[self.database]
            coll = db[self.collection]
            coll.insert_one(json_log)
        except Exception:
            self.__error = 1
            raise

    def __close_connection(self):
        """Close the client if one is open and forget it."""
        if self.__client is not None:
            self.__client.close()
            self.__client = None

    def log_to_db(self, level: str, message: str):
        """
        Write ``{"time", "level", "message"}`` to MongoDB.

        Does nothing when no URL is configured or when a previous call on this
        instance failed. Connection and insert errors are propagated.
        """
        if self.url is None:
            return
        try:
            if self.__error == 0:
                self.__connect()
            if self.__error == 0:
                json_log = {"time": str(datetime.now()), "level": level, "message": message}
                self.__insert(json_log)
        finally:
            # always release the connection, including when the insert raised
            self.__close_connection()
|
model_inference/__init__.py
ADDED
File without changes
|
model_inference/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (167 Bytes). View file
|
|
model_inference/__pycache__/model_inference.cpython-37.pyc
ADDED
Binary file (2.02 kB). View file
|
|
model_inference/model_inference.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils.load_model import load_model
|
2 |
+
from feature_construction.feature_construction import FeatureConstructor
|
3 |
+
import pandas as pd
|
4 |
+
from logger.logger import MongoLogger
|
5 |
+
|
6 |
+
|
7 |
+
def predict(x: pd.DataFrame, predict_proba: bool = True,
            use_deployed_model: bool = True, model_file_name: str = None, predict_label: bool = False):
    """
    Run the stored preprocessing pipeline on ``x`` and return model predictions.

    - predict_proba: when True, return the probability of the positive class
      (column 1 of ``classifier.predict_proba``).
    - when predict_proba is False, return 0/1 classes obtained by comparing that
      probability with the stored ``best_class_threshold``; if ``predict_label``
      is also True the classes are converted back through the label encoder.
    - use_deployed_model / model_file_name: forwarded to ``load_model``.

    Returns None when ``load_model`` returns None. Unexpected errors are
    logged and re-raised.
    """
    logger = MongoLogger()
    prediction = None
    try:
        logger.log_to_db(level="INFO", message="entering model_inference.predict")

        loaded_model = load_model(load_deployed_model=use_deployed_model, model_file_name=model_file_name)

        if loaded_model is not None:
            (label_encoder, cat_var_id_transform, cat_var_threshold_transform,
             num_var_threshold_transform, rare_cat_transform, outlier_transform,
             cat_missing_imputer, num_missing_imputer, one_hot_encoder,
             ordinal_encoder, minmax_scaler, clusterer, multicoll_transform,
             feature_selector, class_weight_flag, smote_transform, classifier, best_class_threshold) = loaded_model

            x = FeatureConstructor().add_features(x).copy()

            # optional transformers applied, in this order, before clustering;
            # a step stored as None is skipped
            steps_before_clustering = (cat_var_id_transform,
                                       cat_var_threshold_transform,
                                       num_var_threshold_transform,
                                       rare_cat_transform,
                                       outlier_transform,
                                       cat_missing_imputer,
                                       num_missing_imputer,
                                       one_hot_encoder,
                                       ordinal_encoder,
                                       minmax_scaler)
            for step in steps_before_clustering:
                if step is not None:
                    x = step.transform(x).copy()

            # cluster membership is appended as extra columns
            if clusterer is not None:
                cluster_ohe, _ = clusterer.predict(x)
                x = pd.concat([x, cluster_ohe], axis=1)

            # optional transformers applied after clustering
            for step in (multicoll_transform, feature_selector):
                if step is not None:
                    x = step.transform(x).copy()

            positive_class_proba = classifier.predict_proba(x)[:, 1]
            if predict_proba:
                prediction = positive_class_proba
            else:
                prediction = (positive_class_proba >= best_class_threshold).astype('int')
                if predict_label:
                    prediction = label_encoder.inverse_transform(prediction)

    except Exception as e:
        logger.log_to_db(level="CRITICAL", message=f"unexpected model_inference.predict error: {e}")
        raise
    logger.log_to_db(level="INFO", message="exiting model_inference.predict")

    return prediction
|
models/deployed_model.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"deployed_model": "model_cold_2022_11_22"}
|
models/model_cold_2022_11_22/feature_importances.csv
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
feature,importance
|
2 |
+
is_single,0.46307153
|
3 |
+
education_num,0.064920954
|
4 |
+
relationship_ Own-child,0.05042011
|
5 |
+
capital_gain,0.04799759
|
6 |
+
occupation_ Exec-managerial,0.036840715
|
7 |
+
marital_status_ Married-civ-spouse,0.029596165
|
8 |
+
occupation_ Prof-specialty,0.02784773
|
9 |
+
occupation_ Other-service,0.027295496
|
10 |
+
age,0.02378357
|
11 |
+
capital_loss,0.01889127
|
12 |
+
sex_ Female,0.017749619
|
13 |
+
relationship_rare_category,0.015518844
|
14 |
+
hours_per_week,0.015093696
|
15 |
+
occupation_ Adm-clerical,0.0121984985
|
16 |
+
race_ White,0.011818247
|
17 |
+
relationship_ Not-in-family,0.011008147
|
18 |
+
marital_status_ Never-married,0.010437393
|
19 |
+
occupation_rare_category,0.009737839
|
20 |
+
workclass_ Self-emp-not-inc,0.009453006
|
21 |
+
workclass_rare_category,0.00917275
|
22 |
+
country_ United-States,0.008190496
|
23 |
+
race_ Black,0.0076376186
|
24 |
+
workclass_ Private,0.007357159
|
25 |
+
education_group_high_school,0.0066947634
|
26 |
+
relationship_marital_ Own-child Never-married,0.0062995595
|
27 |
+
age_of_first_edu,0.00590806
|
28 |
+
education,0.0057953694
|
29 |
+
education_group_college,0.0057467762
|
30 |
+
race_rare_category,0.0057038493
|
31 |
+
fnlwgt,0.005448137
|
32 |
+
occupation_ Sales,0.005127732
|
33 |
+
education_group_rare_category,0.0049574347
|
34 |
+
occupation_ Craft-repair,0.0046369503
|
35 |
+
relationship_marital_ Husband Married-civ-spouse,0.0040548774
|
36 |
+
relationship_marital_rare_category,0.0035880853
|