manivannan023 committed
Commit 4a58702
1 Parent(s): 3ee4302

Upload 83 files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. Dockerfile +8 -0
  2. api.py +69 -0
  3. api_requirements.txt +10 -0
  4. clustering/__init__.py +0 -0
  5. clustering/__pycache__/__init__.cpython-37.pyc +0 -0
  6. clustering/__pycache__/clustering.cpython-37.pyc +0 -0
  7. clustering/clustering.py +66 -0
  8. data_cleaning/__init__.py +0 -0
  9. data_cleaning/__pycache__/__init__.cpython-37.pyc +0 -0
  10. data_cleaning/__pycache__/data_cleaning.cpython-37.pyc +0 -0
  11. data_cleaning/data_cleaning.py +54 -0
  12. data_preprocessing/__init__.py +0 -0
  13. data_preprocessing/__pycache__/__init__.cpython-37.pyc +0 -0
  14. data_preprocessing/__pycache__/categorical_id_var_deletor.cpython-37.pyc +0 -0
  15. data_preprocessing/__pycache__/categorical_imputer.cpython-37.pyc +0 -0
  16. data_preprocessing/__pycache__/categorical_variance_threshold.cpython-37.pyc +0 -0
  17. data_preprocessing/__pycache__/multicollinearity_handler.cpython-37.pyc +0 -0
  18. data_preprocessing/__pycache__/numeric_imputer.cpython-37.pyc +0 -0
  19. data_preprocessing/__pycache__/outlier_handler.cpython-37.pyc +0 -0
  20. data_preprocessing/__pycache__/rare_category_encoder.cpython-37.pyc +0 -0
  21. data_preprocessing/categorical_id_var_deletor.py +47 -0
  22. data_preprocessing/categorical_imputer.py +50 -0
  23. data_preprocessing/categorical_variance_threshold.py +48 -0
  24. data_preprocessing/multicollinearity_handler.py +53 -0
  25. data_preprocessing/numeric_imputer.py +71 -0
  26. data_preprocessing/outlier_handler.py +68 -0
  27. data_preprocessing/rare_category_encoder.py +60 -0
  28. data_validation/__init__.py +0 -0
  29. data_validation/__pycache__/__init__.cpython-37.pyc +0 -0
  30. data_validation/__pycache__/data_validation.cpython-37.pyc +0 -0
  31. data_validation/data_validation.py +62 -0
  32. data_validation/input_data_specs.json +71 -0
  33. feature_construction/__init__.py +0 -0
  34. feature_construction/__pycache__/__init__.cpython-37.pyc +0 -0
  35. feature_construction/__pycache__/feature_construction.cpython-37.pyc +0 -0
  36. feature_construction/feature_construction.py +70 -0
  37. feature_selection/__init__.py +0 -0
  38. feature_selection/__pycache__/__init__.cpython-37.pyc +0 -0
  39. feature_selection/__pycache__/feature_selection.cpython-37.pyc +0 -0
  40. feature_selection/feature_selection.py +143 -0
  41. logger/__init__.py +0 -0
  42. logger/__pycache__/__init__.cpython-37.pyc +0 -0
  43. logger/__pycache__/logger.cpython-37.pyc +0 -0
  44. logger/logger.py +50 -0
  45. model_inference/__init__.py +0 -0
  46. model_inference/__pycache__/__init__.cpython-37.pyc +0 -0
  47. model_inference/__pycache__/model_inference.cpython-37.pyc +0 -0
  48. model_inference/model_inference.py +78 -0
  49. models/deployed_model.json +1 -0
  50. models/model_cold_2022_11_22/feature_importances.csv +36 -0
Dockerfile ADDED
@@ -0,0 +1,8 @@
+ FROM python:3.7-slim
+ WORKDIR /api
+ COPY . .
+ COPY ./docker/api_requirements.txt ./requirements.txt
+ RUN rm -r ./docker
+ RUN pip install --no-cache-dir -r requirements.txt
+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "5001", "--workers", "4"]
+ EXPOSE 5001
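For reference, a minimal sketch of building and serving this image locally; the image tag is illustrative, and the COPY/RUN lines above assume a ./docker directory that sits outside this 50-file view:

    docker build -t income-prediction-api .        # build from the repo root
    docker run -p 5001:5001 income-prediction-api  # serve on the exposed port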
api.py ADDED
@@ -0,0 +1,69 @@
+ from fastapi import FastAPI
+ import uvicorn
+ from pydantic import BaseModel
+ import pandas as pd
+ from data_validation.data_validation import DataValidation
+ from data_cleaning.data_cleaning import DataCleaning
+ from model_inference.model_inference import predict
+ from logger.logger import MongoLogger
+ import traceback
+
+ app = FastAPI()
+
+
+ class Data(BaseModel):
+     """
+     Data dictionary for data type validation
+     """
+     age: int
+     workclass: str
+     fnlwgt: int
+     education: str
+     education_num: int
+     marital_status: str
+     occupation: str
+     relationship: str
+     race: str
+     sex: str
+     capital_gain: int
+     capital_loss: int
+     hours_per_week: int
+     country: str
+
+
+ @app.post("/")
+ def prediction(data: Data):
+     """
+     Processes the API request and returns a prediction
+     """
+     logger = MongoLogger()
+     logger.log_to_db(level="INFO", message="entering prediction_api")
+     try:
+         df = pd.DataFrame(data.dict(), index=[0])  # converting api data dict to df
+         dv = DataValidation(input_df=df, dataset="prediction")  # validating the data
+         validation_status = dv.validate_data()  # status of validation. 1=passed, 0=failed
+
+         if validation_status != 0:
+             data_cleaning = DataCleaning()
+             # cleaning the data
+             df = data_cleaning.clean_column_names(df).copy()
+             df = data_cleaning.shorten_column_names(df).copy()
+             df = data_cleaning.clean_nan(df).copy()
+             # calling the 'model_inference.model_inference.predict' function
+             pred = predict(df, predict_proba=False, predict_label=True)[0].strip()
+
+         else:
+             # executes when data validation fails
+             pred = "data validation failed"
+
+     except Exception as e:
+         # executes in case of any exception
+         pred = e
+         logger.log_to_db(level="CRITICAL", message=f"unexpected error in prediction_api: {traceback.format_exc()}")
+         raise
+     logger.log_to_db(level="INFO", message="exiting prediction_api")
+     return {"result": pred}
+
+
+ if __name__ == '__main__':
+     uvicorn.run(app=app, host='0.0.0.0', port=5001, workers=4)
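A request sketch against this endpoint, using the requests client (illustrative; requests is not pinned in api_requirements.txt). The field values mirror the raw Adult census encoding, including the leading spaces in categorical values:

    import requests  # illustrative client library

    sample = {"age": 39, "workclass": " State-gov", "fnlwgt": 77516, "education": " Bachelors",
              "education_num": 13, "marital_status": " Never-married", "occupation": " Adm-clerical",
              "relationship": " Not-in-family", "race": " White", "sex": " Male",
              "capital_gain": 2174, "capital_loss": 0, "hours_per_week": 40, "country": " United-States"}
    print(requests.post("http://localhost:5001/", json=sample).json())  # e.g. {"result": "<=50K"}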
api_requirements.txt ADDED
@@ -0,0 +1,10 @@
+ scikit-learn==1.0.2
+ xgboost==1.6.2
+ fastapi==0.85.2
+ pydantic==1.10.2
+ uvicorn==0.19.0
+ pymongo==4.3.3
+ pandas==1.3.5
+ cloudpickle==2.2.0
+ hyperopt==0.2.7
+ imblearn==0.0
clustering/__init__.py ADDED
File without changes
clustering/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (162 Bytes).
clustering/__pycache__/clustering.cpython-37.pyc ADDED
Binary file (3.03 kB).
clustering/clustering.py ADDED
@@ -0,0 +1,66 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.cluster import KMeans
+ from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
+ from sklearn.metrics import silhouette_score
+
+
+ class AutoCluster:
+     """
+     Returns clusters by automatically evaluating 'k' using silhouette_score.
+     'k' range = [min_cluster, max_cluster]
+     """
+     def __init__(self, min_cluster: int = 2, max_cluster: int = 10, random_state: int = 42):
+         self.__scaler = None
+         self.__ohe = None
+         self.k = None
+         self.min_cluster = min_cluster
+         self.max_cluster = max_cluster
+         self.kmeans_model = None
+         self.random_state = random_state
+
+     def __fit_scaler(self, x: pd.DataFrame):
+         x = x.copy()
+         self.__scaler = MinMaxScaler()
+         self.__scaler.fit(x)
+
+     def __find_best_k(self, x: pd.DataFrame):
+         x = x.copy()
+         self.__fit_scaler(x)
+         x_scaled = self.__scaler.transform(x)
+         silhouette_scores = []
+         for k in range(self.min_cluster, self.max_cluster + 1):
+             kmeans = KMeans(n_clusters=k, random_state=self.random_state)
+             kmeans.fit(x_scaled)
+             labels = kmeans.labels_
+             silhouette_scores.append(silhouette_score(X=x_scaled, labels=labels, random_state=self.random_state))
+         self.k = self.min_cluster + np.argmax(silhouette_scores)
+
+     def __fit_one_hot_encoder(self, x: pd.DataFrame):
+         self.__ohe = OneHotEncoder(sparse=False)
+         self.__ohe.fit(x)
+
+     def fit(self, x: pd.DataFrame):
+         x = x.copy()
+         self.__find_best_k(x)
+         self.kmeans_model = KMeans(n_clusters=self.k, random_state=self.random_state)
+         x_scaled = self.__scaler.transform(x)
+         self.kmeans_model.fit(x_scaled)
+         prediction_df = pd.DataFrame({'cluster': self.kmeans_model.predict(x_scaled)})
+         self.__fit_one_hot_encoder(prediction_df)
+         return self
+
+     def predict(self, x: pd.DataFrame):
+         x = x.copy()
+         x_scaled = self.__scaler.transform(x)
+         prediction = self.kmeans_model.predict(x_scaled)
+         prediction_df = pd.DataFrame({'cluster': prediction})
+         prediction_ohe = pd.DataFrame(self.__ohe.transform(prediction_df), columns=self.__ohe.get_feature_names_out())
+         return prediction_ohe, prediction
+
+     def fit_predict(self, x: pd.DataFrame):
+         self.fit(x)
+         return self.predict(x)  # return the predictions instead of discarding them
+
+     def __repr__(self):
+         return f"AutoCluster(min_cluster={self.min_cluster}, max_cluster={self.max_cluster}, random_state={self.random_state})"
data_cleaning/__init__.py ADDED
File without changes
data_cleaning/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (165 Bytes).
data_cleaning/__pycache__/data_cleaning.cpython-37.pyc ADDED
Binary file (2.88 kB).
data_cleaning/data_cleaning.py ADDED
@@ -0,0 +1,54 @@
+ import pandas as pd
+ import re
+ import numpy as np
+ from logger.logger import MongoLogger
+
+
+ class DataCleaning:
+     def __init__(self):
+         self.logger = MongoLogger()
+
+     def clean_column_names(self, input_df: pd.DataFrame):
+         """
+         Replaces special characters in column names with underscores and
+         converts column names to lowercase
+         """
+         self.logger.log_to_db(level="INFO", message="entering data_cleaning.clean_column_names")
+         try:
+             df = input_df.copy()
+             clean_col_names = [re.sub(r"[\.\?\s]", "_", col_name.lower().strip()) for col_name in df.columns]
+             df.columns = clean_col_names
+         except Exception as e:
+             self.logger.log_to_db(level="CRITICAL", message=f"unexpected data_cleaning.clean_column_names error: {e}")
+             raise
+         self.logger.log_to_db(level="INFO", message="exiting data_cleaning.clean_column_names")
+         return df
+
+     def shorten_column_names(self, input_df: pd.DataFrame, max_len: int = 25):
+         """
+         Truncates column names to max_len characters
+         """
+         self.logger.log_to_db(level="INFO", message="entering data_cleaning.shorten_column_names")
+         try:
+             df = input_df.copy()
+             short_col_names = [col_name[:max_len] for col_name in df.columns]
+             df.columns = short_col_names
+         except Exception as e:
+             self.logger.log_to_db(level="CRITICAL", message=f"unexpected data_cleaning.shorten_column_names error: {e}")
+             raise
+         self.logger.log_to_db(level="INFO", message="exiting data_cleaning.shorten_column_names")
+         return df
+
+     def clean_nan(self, input_df: pd.DataFrame, to_replace: list = [' ?', '?', '-', '_', -1, "-1"]):
+         """
+         Replaces placeholder values (' ?', '?', '-', '_', -1, "-1") in the data with NaN
+         """
+         self.logger.log_to_db(level="INFO", message="entering data_cleaning.clean_nan")
+         try:
+             df = input_df.copy()
+             df.replace(to_replace=to_replace, value=np.nan, inplace=True, regex=False)
+         except Exception as e:
+             self.logger.log_to_db(level="CRITICAL", message=f"unexpected data_cleaning.clean_nan error: {e}")
+             raise
+         self.logger.log_to_db(level="INFO", message="exiting data_cleaning.clean_nan")
+         return df
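A quick sketch of the cleaning sequence the API applies, with illustrative column and cell values:

    import pandas as pd
    from data_cleaning.data_cleaning import DataCleaning

    raw = pd.DataFrame({"Marital Status": [" Never-married", " ?"], "Age": [39, -1]})
    dc = DataCleaning()
    df = dc.clean_column_names(raw)   # -> columns: marital_status, age
    df = dc.shorten_column_names(df)  # truncates names longer than 25 chars
    df = dc.clean_nan(df)             # ' ?' and -1 become NaN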
data_preprocessing/__init__.py ADDED
File without changes
data_preprocessing/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (170 Bytes).
data_preprocessing/__pycache__/categorical_id_var_deletor.cpython-37.pyc ADDED
Binary file (2.52 kB).
data_preprocessing/__pycache__/categorical_imputer.cpython-37.pyc ADDED
Binary file (2.7 kB).
data_preprocessing/__pycache__/categorical_variance_threshold.cpython-37.pyc ADDED
Binary file (2.71 kB).
data_preprocessing/__pycache__/multicollinearity_handler.cpython-37.pyc ADDED
Binary file (2.68 kB).
data_preprocessing/__pycache__/numeric_imputer.cpython-37.pyc ADDED
Binary file (3.17 kB).
data_preprocessing/__pycache__/outlier_handler.cpython-37.pyc ADDED
Binary file (2.77 kB).
data_preprocessing/__pycache__/rare_category_encoder.cpython-37.pyc ADDED
Binary file (2.92 kB).
data_preprocessing/categorical_id_var_deletor.py ADDED
@@ -0,0 +1,47 @@
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class CategoricalIdVarDeletor(BaseEstimator, TransformerMixin):
+     """
+     Removes columns whose proportion of unique categories is >= threshold.
+     With threshold=1, a column is removed only when 100% of its categories are
+     unique, i.e. it behaves like an ID column.
+     Doesn't remove columns that are all NA; NA is counted as a separate category.
+     """
+     def __init__(self, threshold: float = 1):
+         self.threshold = threshold
+         self.selected_features = []
+
+     @staticmethod
+     def get_unique_cat_percent(x: pd.Series, n_rows: int):
+         x = x.copy()
+         return len(x.fillna("!@#$%This value is missing^&*(").unique()) / n_rows
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_id_var_deletor.fit")
+             x = x.copy()
+             unique_cat_percent = x.apply(self.get_unique_cat_percent, n_rows=x.shape[0])
+             self.selected_features = [*unique_cat_percent[unique_cat_percent < self.threshold].index]
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_id_var_deletor.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_id_var_deletor.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_id_var_deletor.transform")
+             x = x.copy()
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_id_var_deletor.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_id_var_deletor.transform")
+         return x[self.selected_features]
+
+     def get_feature_names_out(self, input_features=None):
+         return self.selected_features
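All the data_preprocessing transformers share the sklearn fit/transform contract; a minimal sketch with this one on illustrative data:

    import pandas as pd
    from data_preprocessing.categorical_id_var_deletor import CategoricalIdVarDeletor

    df = pd.DataFrame({"user_id": ["u1", "u2", "u3"],  # every value unique -> dropped
                       "workclass": [" Private", " Private", " State-gov"]})
    deletor = CategoricalIdVarDeletor(threshold=1).fit(df)
    print(deletor.transform(df).columns.tolist())  # ['workclass']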
data_preprocessing/categorical_imputer.py ADDED
@@ -0,0 +1,50 @@
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class CategoricalImputer(BaseEstimator, TransformerMixin):
+     """
+     Imputes missing values in categorical features.
+     strategy:
+         - "most_frequent": imputes the most frequent value (mode).
+         - "constant": imputes a constant named "<column_name>_missing".
+     """
+     def __init__(self, strategy="most_frequent"):
+         self.strategy = strategy
+         self.fill_values = None
+         self.feature_names = []
+
+     def find_fill_values(self, x):
+         if self.strategy == 'constant':
+             self.fill_values = [f'{column}_missing' for column in [*x.columns]]
+         else:
+             self.fill_values = [*x.apply(lambda column: [*column.value_counts().index][0])]
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_imputer.fit")
+             self.feature_names = [*x.columns]
+             self.find_fill_values(x)
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_imputer.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_imputer.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_imputer.transform")
+             x = x.copy()
+             for i, column in enumerate([*x.columns]):
+                 x[column] = x[column].fillna(self.fill_values[i])
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_imputer.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_imputer.transform")
+         return x
+
+     def get_feature_names_out(self, input_features=None):
+         return self.feature_names
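A sketch of the two imputation strategies on an illustrative column:

    import numpy as np
    import pandas as pd
    from data_preprocessing.categorical_imputer import CategoricalImputer

    df = pd.DataFrame({"occupation": [" Sales", " Sales", np.nan]})
    print(CategoricalImputer(strategy="most_frequent").fit(df).transform(df))  # NaN -> ' Sales'
    print(CategoricalImputer(strategy="constant").fit(df).transform(df))       # NaN -> 'occupation_missing'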
data_preprocessing/categorical_variance_threshold.py ADDED
@@ -0,0 +1,48 @@
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class CategoricalVarianceThreshold(BaseEstimator, TransformerMixin):
+     """
+     Removes low-variance categorical columns, i.e. columns where the proportion of
+     the most frequent category is >= threshold.
+     Columns that are all NA are removed; NA is counted as a separate category.
+     """
+
+     def __init__(self, threshold: float = 0.99):
+         self.threshold = threshold
+         self.selected_features = []
+
+     @staticmethod
+     def __get_max_category_proportions(x: pd.DataFrame):
+         x = x.copy()
+         return x.apply(lambda col: col.fillna("!@#$%This value is missing^&*(").value_counts(normalize=True).values[0])
+
+     def fit(self, x, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_variance_threshold.fit")
+             x = x.copy()
+             max_category_proportions = self.__get_max_category_proportions(x)
+             self.selected_features = [*max_category_proportions[max_category_proportions < self.threshold].index]
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected categorical_variance_threshold.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_variance_threshold.fit")
+         return self
+
+     def transform(self, x, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering categorical_variance_threshold.transform")
+             x = x.copy()
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL",
+                              message=f"unexpected categorical_variance_threshold.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting categorical_variance_threshold.transform")
+         return x[self.selected_features]
+
+     def get_feature_names_out(self, input_features=None):
+         return self.selected_features
data_preprocessing/multicollinearity_handler.py ADDED
@@ -0,0 +1,53 @@
+ import pandas as pd
+ import numpy as np
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
+ from sklearn.base import BaseEstimator, TransformerMixin
+ import warnings
+ from logger.logger import MongoLogger
+ warnings.filterwarnings("ignore")
+
+
+ class MulticollinearityHandler(BaseEstimator, TransformerMixin):
+     """
+     Removes numeric variables having VIF > threshold
+     """
+     def __init__(self, threshold: float = 10):
+         self.threshold = threshold
+         self.selected_features = []
+
+     @staticmethod
+     def get_vif(x: pd.DataFrame):
+         x = x.copy()
+         vif = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]  # calculate VIF per column
+         return vif
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering multicollinearity_handler.fit")
+             x = x.copy()
+             vif = self.get_vif(x)
+             while np.max(vif) > self.threshold:  # keep looping while any VIF > threshold
+                 max_vif_column = x.columns[np.argmax(vif)]  # column with max VIF
+                 x = x.drop(columns=[max_vif_column]).copy()  # drop the worst column, one at a time
+                 vif = self.get_vif(x)
+             self.selected_features = [*x.columns]  # remaining columns, i.e. columns without multicollinearity
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected multicollinearity_handler.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting multicollinearity_handler.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering multicollinearity_handler.transform")
+             x = x.copy()
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected multicollinearity_handler.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting multicollinearity_handler.transform")
+         return x[self.selected_features]
+
+     def get_feature_names_out(self, input_features=None):
+         return self.selected_features
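A sketch of the VIF-pruning behaviour on synthetic data, where one column is a near-duplicate of another (data is illustrative):

    import numpy as np
    import pandas as pd
    from data_preprocessing.multicollinearity_handler import MulticollinearityHandler

    rng = np.random.default_rng(42)
    a = rng.normal(size=200)
    df = pd.DataFrame({"a": a,
                       "b": a * 2 + rng.normal(scale=0.01, size=200),  # near-duplicate of "a"
                       "c": rng.normal(size=200)})
    handler = MulticollinearityHandler(threshold=10).fit(df)
    print(handler.selected_features)  # one of the collinear pair is dropped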
data_preprocessing/numeric_imputer.py ADDED
@@ -0,0 +1,71 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class NumericImputer(BaseEstimator, TransformerMixin):
+     """
+     Imputes missing values in numeric features.
+     strategy:
+         - "mean": imputes the mean.
+         - "median": imputes the median.
+         - "lower_bound": imputes the lower bound of the IQR method for outlier detection.
+         - "upper_bound": imputes the upper bound of the IQR method for outlier detection.
+     The lower/upper bounds can be controlled via bound_factor. A higher bound_factor results
+     in end-of-distribution imputation, the numeric analogue of adding a 'missing' category
+     in categorical imputation.
+     """
+     def __init__(self, strategy: str = "median", bound_factor: float = 1.5):
+         self.strategy = strategy
+         self.bound_factor = bound_factor
+         self.fill_values = []
+         self.feature_names = []
+
+     def find_bounds(self, x: pd.Series):
+         """
+         Finds the lower/upper bound using the IQR method for outlier detection.
+         """
+         q1 = x.quantile(0.25)
+         q3 = x.quantile(0.75)
+         iqr = q3 - q1
+         if self.strategy == "lower_bound":
+             bound = q1 - (self.bound_factor * iqr)
+         else:
+             bound = q3 + (self.bound_factor * iqr)
+         return bound
+
+     def find_fill_values(self, x: pd.Series):
+         if self.strategy == 'mean':
+             self.fill_values.append(np.mean(x.dropna()))
+         elif self.strategy == 'median':
+             self.fill_values.append(np.median(x.dropna()))
+         else:
+             self.fill_values.append(self.find_bounds(x.dropna()))
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering numeric_imputer.fit")
+             self.fill_values = []  # reset so a refit doesn't accumulate stale values
+             self.feature_names = [*x.columns]
+             x.apply(self.find_fill_values)
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected numeric_imputer.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting numeric_imputer.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering numeric_imputer.transform")
+             x = x.copy()
+             for i, column in enumerate([*x.columns]):
+                 x[column] = x[column].fillna(self.fill_values[i])
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected numeric_imputer.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting numeric_imputer.transform")
+         return x
+
+     def get_feature_names_out(self, input_features=None):
+         return self.feature_names
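A sketch of two of the strategies on an illustrative column:

    import numpy as np
    import pandas as pd
    from data_preprocessing.numeric_imputer import NumericImputer

    df = pd.DataFrame({"age": [25.0, 30.0, np.nan, 90.0]})
    print(NumericImputer(strategy="median").fit(df).transform(df))       # NaN -> 30.0
    print(NumericImputer(strategy="upper_bound").fit(df).transform(df))  # NaN -> Q3 + 1.5*IQR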
data_preprocessing/outlier_handler.py ADDED
@@ -0,0 +1,68 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class OutlierHandler(BaseEstimator, TransformerMixin):
+     """
+     Detects outliers using the IQR method, then either replaces them with NaN or
+     caps them at the IQR lower/upper bounds, i.e. winsorizes them.
+     """
+     def __init__(self, method: str = 'winsorize', factor: float = 1.5):
+         self.method = method
+         self.factor = factor
+         self.lower_bounds = []
+         self.upper_bounds = []
+         self.feature_names = []
+
+     def detect_bounds(self, x: pd.Series):
+         """
+         Detects the lower and upper bounds using the IQR method
+         """
+         x = x.copy()
+         q1 = x.quantile(0.25)
+         q3 = x.quantile(0.75)
+         iqr = q3 - q1
+         lower_bound = q1 - (self.factor * iqr)
+         upper_bound = q3 + (self.factor * iqr)
+         self.lower_bounds.append(lower_bound)
+         self.upper_bounds.append(upper_bound)
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering outlier_handler.fit")
+             x = x.copy()
+             self.lower_bounds = []  # reset so a refit doesn't accumulate stale bounds
+             self.upper_bounds = []
+             self.feature_names = [*x.columns]
+             x.apply(self.detect_bounds)
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected outlier_handler.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting outlier_handler.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering outlier_handler.transform")
+             x = x.copy()
+             for i, column in enumerate(x.columns):
+                 lower_bound = self.lower_bounds[i]
+                 upper_bound = self.upper_bounds[i]
+                 lower_repl = np.nan
+                 upper_repl = np.nan
+                 if self.method == 'winsorize':
+                     lower_repl = lower_bound
+                     upper_repl = upper_bound
+                 x.loc[(x[column] < lower_bound), column] = lower_repl
+                 x.loc[(x[column] > upper_bound), column] = upper_repl
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected outlier_handler.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting outlier_handler.transform")
+         return x
+
+     def get_feature_names_out(self, input_features=None):
+         return self.feature_names
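A winsorizing sketch on an illustrative column:

    import pandas as pd
    from data_preprocessing.outlier_handler import OutlierHandler

    df = pd.DataFrame({"hours_per_week": [38, 40, 40, 42, 45, 99]})
    handler = OutlierHandler(method="winsorize", factor=1.5).fit(df)
    print(handler.transform(df))  # 99 is capped at the IQR upper bound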
data_preprocessing/rare_category_encoder.py ADDED
@@ -0,0 +1,60 @@
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from logger.logger import MongoLogger
+
+
+ class RareCategoryEncoder(BaseEstimator, TransformerMixin):
+     """
+     Replaces rare categories with replace_value ('rare_category' by default) and stores
+     the frequent categories per column. Any category unseen during fit is also replaced
+     with replace_value at transform time.
+     NAs are left untouched and excluded from the frequency calculation: in a column whose
+     only non-NA value is 'a', 'a' is kept as frequent, since the only observed value
+     can't be rare.
+     """
+     def __init__(self, threshold: float = 0.05, replace_value: str = 'rare_category'):
+         self.threshold = threshold
+         self.replace_value = replace_value
+         self.feature_names = []
+         self.frequent_cat_list = []
+
+     def __frequent_category_detector(self, x: pd.Series, y=None):
+         x = x.copy()
+         val_counts = x.value_counts(normalize=True)
+         # frequent categories in a column are the ones whose frequency > threshold
+         frequent_cats = [*val_counts[val_counts > self.threshold].index]
+         self.frequent_cat_list.append(frequent_cats)
+
+     def fit(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering rare_category_encoder.fit")
+             x = x.copy()
+             self.frequent_cat_list = []  # reset so a refit doesn't accumulate stale lists
+             self.feature_names = [*x.columns]
+             x.apply(self.__frequent_category_detector)
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected rare_category_encoder.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting rare_category_encoder.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering rare_category_encoder.transform")
+             x = x.copy()
+             for i in range(x.shape[1]):
+                 x_ser = x.iloc[:, i].copy()
+                 # replace values in each column that are neither frequent nor NaN with replace_value
+                 x_ser[~x_ser.isin(self.frequent_cat_list[i]) & x_ser.notna()] = self.replace_value
+                 x.iloc[:, i] = x_ser
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected rare_category_encoder.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting rare_category_encoder.transform")
+         return x
+
+     def get_feature_names_out(self, input_features=None):
+         return self.feature_names
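A sketch of the rare/unseen-category behaviour; ' Atlantis' is a deliberately fictitious category:

    import pandas as pd
    from data_preprocessing.rare_category_encoder import RareCategoryEncoder

    train = pd.DataFrame({"country": [" United-States"] * 19 + [" Holand-Netherlands"]})
    encoder = RareCategoryEncoder(threshold=0.05).fit(train)
    test = pd.DataFrame({"country": [" United-States", " Holand-Netherlands", " Atlantis"]})
    print(encoder.transform(test)["country"].tolist())
    # [' United-States', 'rare_category', 'rare_category']  -- unseen values are mapped too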
data_validation/__init__.py ADDED
File without changes
data_validation/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (167 Bytes).
data_validation/__pycache__/data_validation.cpython-37.pyc ADDED
Binary file (2.27 kB).
data_validation/data_validation.py ADDED
@@ -0,0 +1,62 @@
+ from utils.json_parser import JSONParser
+ import pandas as pd
+ import os
+ from logger.logger import MongoLogger
+
+
+ class DataValidation:
+     """
+     - Class to validate input data as per the data dictionary. It validates the column count,
+       column data types and column names.
+     - Returns 1 if data is valid, else 0
+     - dataset: str
+         - train: for the train set
+         - test: for the test set
+         - prediction: for single-sample inference / batch inference
+     """
+     def __init__(self, input_df: pd.DataFrame, dataset: str = "train"):
+         self.input_df = input_df
+         self.dataset = dataset
+
+     def validate_data(self):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering data_validation")
+             status = 1
+             # parsing input data specification JSON
+             json_parser = JSONParser(os.path.join('.', "data_validation", 'input_data_specs.json'))
+             input_data_specs_dict = json_parser.parse_json()
+             # column specs for train and test data
+             column_count_key = 'train_test_column_count'
+             column_name_key = 'train_test_column_names'
+             column_types_key = 'train_test_column_dtypes'
+             # column specs for prediction data; prediction data shouldn't have the target (salary) column
+             if self.dataset == "prediction":
+                 column_count_key = 'prediction_column_count'
+                 column_name_key = 'prediction_column_names'
+                 column_types_key = 'prediction_column_dtypes'
+
+             n_cols = input_data_specs_dict[column_count_key]
+             col_names = input_data_specs_dict[column_name_key]
+             col_dtypes = input_data_specs_dict[column_types_key]
+
+             if len(self.input_df.columns) != n_cols:
+                 status = 0
+                 logger.log_to_db(level="CRITICAL",
+                                  message=f"{self.dataset} data_validation failed: column count doesn't match")
+
+             if col_names != [*self.input_df.columns]:
+                 status = 0
+                 logger.log_to_db(level="CRITICAL",
+                                  message=f"{self.dataset} data_validation failed: column names don't match")
+
+             if col_dtypes != self.input_df.dtypes.tolist():
+                 status = 0
+                 logger.log_to_db(level="CRITICAL",
+                                  message=f"{self.dataset} data_validation failed: column dtypes don't match")
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected data_validation error: {e}")
+             raise
+
+         logger.log_to_db(level="INFO", message="exiting data_validation")
+         return status
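A sketch of validating a prediction batch against input_data_specs.json (the CSV name is illustrative):

    import pandas as pd
    from data_validation.data_validation import DataValidation

    df = pd.read_csv("prediction_batch.csv")  # must match the prediction schema below
    status = DataValidation(input_df=df, dataset="prediction").validate_data()
    if status == 1:
        print("schema OK")  # 14 columns with the expected names and dtypes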
data_validation/input_data_specs.json ADDED
@@ -0,0 +1,71 @@
+ {
+   "train_test_column_count": 15,
+   "train_test_column_names": [
+     "age",
+     "workclass",
+     "fnlwgt",
+     "education",
+     "education_num",
+     "marital_status",
+     "occupation",
+     "relationship",
+     "race",
+     "sex",
+     "capital_gain",
+     "capital_loss",
+     "hours_per_week",
+     "country",
+     "salary"
+   ],
+   "train_test_column_dtypes": [
+     "int64",
+     "O",
+     "int64",
+     "O",
+     "int64",
+     "O",
+     "O",
+     "O",
+     "O",
+     "O",
+     "int64",
+     "int64",
+     "int64",
+     "O",
+     "O"
+   ],
+   "prediction_column_count": 14,
+   "prediction_column_names": [
+     "age",
+     "workclass",
+     "fnlwgt",
+     "education",
+     "education_num",
+     "marital_status",
+     "occupation",
+     "relationship",
+     "race",
+     "sex",
+     "capital_gain",
+     "capital_loss",
+     "hours_per_week",
+     "country"
+   ],
+   "prediction_column_dtypes": [
+     "int64",
+     "O",
+     "int64",
+     "O",
+     "int64",
+     "O",
+     "O",
+     "O",
+     "O",
+     "O",
+     "int64",
+     "int64",
+     "int64",
+     "O"
+   ]
+ }
feature_construction/__init__.py ADDED
File without changes
feature_construction/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (172 Bytes).
feature_construction/__pycache__/feature_construction.cpython-37.pyc ADDED
Binary file (1.71 kB).
feature_construction/feature_construction.py ADDED
@@ -0,0 +1,70 @@
+ import pandas as pd
+ from logger.logger import MongoLogger
+
+
+ class FeatureConstructor:
+     """
+     Adds new engineered features to input data
+     """
+     @staticmethod
+     def add_features(x: pd.DataFrame):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering add_features")
+             x = x.copy()
+
+             education_group = {' 5th-6th': 'middle_school',
+                                ' 7th-8th': 'middle_school',
+                                ' 9th': 'middle_school',
+                                ' 10th': 'high_school',
+                                ' 11th': 'high_school',
+                                ' 12th': 'high_school',
+                                ' HS-grad': 'hs_grad',
+                                ' Prof-school': 'high_school',
+                                ' Some-college': 'college',
+                                ' Masters': 'college',
+                                ' Bachelors': 'college',
+                                ' 1st-4th': 'primary_school',
+                                ' Preschool': 'primary_school',
+                                ' Assoc-voc': 'college',
+                                ' Assoc-acdm': 'college',
+                                ' Doctorate': 'doctorate'}
+             x['education_group'] = x['education'].map(education_group)
+
+             # workclass_group has a very low mutual info score, so it was removed
+
+             # workclass_group = {' Federal-gov': 'government',
+             #                    ' Local-gov': 'government',
+             #                    ' State-gov': 'government',
+             #                    ' Private': 'private',
+             #                    ' Self-emp-inc': 'self_emp',
+             #                    ' Self-emp-not-inc': 'self_emp',
+             #                    ' Never-worked': 'no_work',
+             #                    ' Without-pay': 'no_work'}
+             # x['workclass_group'] = x['workclass'].map(workclass_group)
+
+             is_single = {' Divorced': 1,
+                          ' Married-spouse-absent': 1,
+                          ' Never-married': 1,
+                          ' Separated': 1,
+                          ' Widowed': 1,
+                          ' Married-AF-spouse': 0,
+                          ' Married-civ-spouse': 0}
+             x['is_single'] = x['marital_status'].map(is_single)
+
+             x['relationship_marital'] = x['relationship'] + x['marital_status']
+
+             # x.loc[(x['capital_gain'] > 5000), 'capital_gain'] = 5000
+
+             x['has_capital_gain'] = (x['capital_gain'] > 0).astype("int")
+
+             # x['has_capital_loss'] = (x['capital_loss'] > 0).astype("int")
+
+             x['age_of_first_edu'] = x['age'] - x['education_num']
+
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected add_features error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting add_features")
+
+         return x
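A one-row sketch of the constructed features (values are illustrative):

    import pandas as pd
    from feature_construction.feature_construction import FeatureConstructor

    row = pd.DataFrame([{"age": 39, "education": " Bachelors", "education_num": 13,
                         "marital_status": " Never-married", "relationship": " Not-in-family",
                         "capital_gain": 2174}])
    out = FeatureConstructor.add_features(row)
    print(out[["education_group", "is_single", "has_capital_gain", "age_of_first_edu"]])
    # college, 1, 1, 26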
feature_selection/__init__.py ADDED
File without changes
feature_selection/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (169 Bytes).
feature_selection/__pycache__/feature_selection.cpython-37.pyc ADDED
Binary file (5.42 kB).
feature_selection/feature_selection.py ADDED
@@ -0,0 +1,143 @@
+ import pandas as pd
+ import numpy as np
+ from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
+ from sklearn.ensemble import RandomForestClassifier
+ from xgboost import XGBClassifier
+ from sklearn.metrics import roc_auc_score
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.model_selection import StratifiedKFold
+ from utils.find_class_weights import find_class_weights
+ from sklearn.utils import class_weight
+ from logger.logger import MongoLogger
+
+
+ class FeatureSelector(BaseEstimator, TransformerMixin):
+     """
+     - Feature selector selects n_features by training RandomForestClassifier and XGBClassifier.
+     - It uses hyperopt hyperparameter tuning to choose the best model of the two.
+     - It uses the feature importances of the best model to select features.
+     """
+     def __init__(self, n_features: int, n_trials: int = 10, cv_splits: int = 2):
+         self.n_features = int(n_features)
+         self.n_trials = n_trials
+         self.cv_splits = cv_splits
+         self.selected_features = []
+         self.feature_importance = None
+
+     def __find_best_model(self, x: pd.DataFrame, y: pd.Series):
+         x = x.reset_index(drop=True).copy()
+         y = y.reset_index(drop=True).copy()
+         classifier_params = ['random_forest', 'xgboost']
+         max_depth_params = [3, 4, 5, 6, 7, 8, 9, 10]
+         n_estimators_params = [50, 100, 200, 300, 500]
+         eta_params = [0.1, 0.3, 0.01, 0.001, 0.0001, 1]
+         search_space = {'classifier': hp.choice('classifier', [
+             {
+                 'type': classifier_params[0],
+                 'max_depth': hp.choice('rf_max_depth', max_depth_params),
+                 'n_estimators': hp.choice('rf_n_estimators', n_estimators_params)
+             },
+             {
+                 'type': classifier_params[1],
+                 'max_depth': hp.choice('xgb_max_depth', max_depth_params),
+                 'n_estimators': hp.choice('xgb_n_estimators', n_estimators_params),
+                 'eta': hp.choice('xgb_eta', eta_params)
+             }
+         ])}
+
+         def objective(params):
+             classifier = None
+             max_depth = params['classifier']['max_depth']
+             n_estimators = params['classifier']['n_estimators']
+             train_scores = []
+             val_scores = []
+             skfold = StratifiedKFold(n_splits=self.cv_splits)
+             for train_idx, test_idx in skfold.split(x, y):
+                 if params['classifier']['type'] == classifier_params[0]:  # rf
+                     class_weight_params = find_class_weights(y=y[train_idx])
+                     classifier = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators,
+                                                         random_state=42, class_weight=class_weight_params)
+                     classifier.fit(x.iloc[train_idx], y[train_idx])
+
+                 if params['classifier']['type'] == classifier_params[1]:  # xgboost
+                     eta = params['classifier']['eta']
+                     sample_weight_params = class_weight.compute_sample_weight(class_weight="balanced", y=y[train_idx])
+                     classifier = XGBClassifier(n_estimators=n_estimators, eta=eta,
+                                                max_depth=max_depth, random_state=42, verbosity=0)
+                     classifier.fit(x.iloc[train_idx], y[train_idx], sample_weight=sample_weight_params)
+
+                 train_scores.append(roc_auc_score(y[train_idx],
+                                                   classifier.predict_proba(x.iloc[train_idx])[:, 1]))
+                 val_scores.append(roc_auc_score(y[test_idx],
+                                                 classifier.predict_proba(x.iloc[test_idx])[:, 1]))
+
+             avg_train_score = np.mean(train_scores)
+             avg_val_score = np.mean(val_scores)
+             return {'loss': -avg_val_score, 'train_score': avg_train_score, 'val_score': avg_val_score,
+                     'status': STATUS_OK}
+
+         model_trials = Trials()
+         model_best = fmin(
+             fn=objective,
+             space=search_space,
+             algo=tpe.suggest,
+             max_evals=self.n_trials,
+             catch_eval_exceptions=False,
+             verbose=False,
+             trials=model_trials
+         )
+
+         best_classifier_name = classifier_params[model_best['classifier']]
+         best_classifier_model = None
+         if best_classifier_name == classifier_params[0]:  # rf
+             class_weight_params = find_class_weights(y=y)
+             best_max_depth = max_depth_params[model_best['rf_max_depth']]
+             best_n_estimators = n_estimators_params[model_best['rf_n_estimators']]
+             best_classifier_model = RandomForestClassifier(max_depth=best_max_depth,
+                                                            n_estimators=best_n_estimators,
+                                                            random_state=42,
+                                                            class_weight=class_weight_params)
+
+         if best_classifier_name == classifier_params[1]:  # xgboost
+             best_n_estimators = n_estimators_params[model_best['xgb_n_estimators']]
+             best_max_depth = max_depth_params[model_best['xgb_max_depth']]
+             best_eta = eta_params[model_best['xgb_eta']]
+             best_classifier_model = XGBClassifier(n_estimators=best_n_estimators, eta=best_eta,
+                                                   max_depth=best_max_depth, random_state=42, verbosity=0)
+
+         return best_classifier_model, model_trials.best_trial
+
+     def fit(self, x: pd.DataFrame, y: pd.Series):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering feature_selection.fit")
+             x = x.reset_index(drop=True).copy()
+             y = y.reset_index(drop=True).copy()
+             best_classifier, _ = self.__find_best_model(x, y)
+             if isinstance(best_classifier, XGBClassifier):
+                 sample_weight_params = class_weight.compute_sample_weight(class_weight="balanced", y=y)
+                 _ = best_classifier.fit(x, y, sample_weight=sample_weight_params)
+             else:
+                 _ = best_classifier.fit(x, y)
+             self.feature_importance = pd.Series(best_classifier.feature_importances_,
+                                                 index=x.columns).sort_values(ascending=False)
+             self.selected_features = [*self.feature_importance[: self.n_features].index]
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected feature_selection.fit error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting feature_selection.fit")
+         return self
+
+     def transform(self, x: pd.DataFrame, y=None):
+         logger = MongoLogger()
+         try:
+             logger.log_to_db(level="INFO", message="entering feature_selection.transform")
+             x = x.reset_index(drop=True).copy()
+         except Exception as e:
+             logger.log_to_db(level="CRITICAL", message=f"unexpected feature_selection.transform error: {e}")
+             raise
+         logger.log_to_db(level="INFO", message="exiting feature_selection.transform")
+         return x[self.selected_features]
+
+     def get_feature_names_out(self, input_features=None):
+         return self.selected_features
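A sketch on synthetic data; note the class depends on utils.find_class_weights, which sits outside this 50-file view:

    import pandas as pd
    from sklearn.datasets import make_classification
    from feature_selection.feature_selection import FeatureSelector

    x, y = make_classification(n_samples=300, n_features=10, n_informative=3, random_state=42)
    x = pd.DataFrame(x, columns=[f"f{i}" for i in range(10)])
    selector = FeatureSelector(n_features=5, n_trials=5, cv_splits=2).fit(x, pd.Series(y))
    print(selector.selected_features)  # the 5 most important columns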
logger/__init__.py ADDED
File without changes
logger/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (158 Bytes).
logger/__pycache__/logger.cpython-37.pyc ADDED
Binary file (1.95 kB).
logger/logger.py ADDED
@@ -0,0 +1,50 @@
+ import pymongo
+ from datetime import datetime
+ from utils.set_log_secrets_env import set_log_secrets_env
+ import os
+
+
+ class MongoLogger:
+     """
+     Custom logger that inserts logs into MongoDB
+     """
+     def __init__(self):
+         set_log_secrets_env()
+         self.url = os.getenv('LOGGER_URL')
+         self.database = "logger_db"
+         self.collection = "logger"
+         self.__client = None
+         self.__error = 0
+
+     def __connect(self):
+         try:
+             self.__client = pymongo.MongoClient(self.url)
+             _ = self.__client.list_database_names()  # force a round trip to surface connection errors
+         except Exception:
+             self.__error = 1
+             self.__client = None
+             raise
+
+     def __insert(self, json_log):
+         try:
+             db = self.__client[self.database]
+             coll = db[self.collection]
+             coll.insert_one(json_log)
+         except Exception:
+             self.__error = 1
+             raise
+
+     def __close_connection(self):
+         if self.__client is not None:
+             self.__client.close()
+             self.__client = None
+
+     def log_to_db(self, level: str, message: str):
+         if self.url is not None:
+             if self.__error == 0:
+                 self.__connect()
+             if self.__error == 0:
+                 json_log = {"time": str(datetime.now()), "level": level, "message": message}
+                 self.__insert(json_log)
+             if self.__client is not None:
+                 self.__close_connection()
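A local smoke-test sketch, assuming utils.set_log_secrets_env (outside this 50-file view) leaves a pre-set LOGGER_URL in place; the connection string is illustrative:

    import os
    os.environ["LOGGER_URL"] = "mongodb://localhost:27017"  # illustrative; normally set via set_log_secrets_env
    from logger.logger import MongoLogger

    MongoLogger().log_to_db(level="INFO", message="smoke test")  # inserts into logger_db.logger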
model_inference/__init__.py ADDED
File without changes
model_inference/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (167 Bytes).
model_inference/__pycache__/model_inference.cpython-37.pyc ADDED
Binary file (2.02 kB).
model_inference/model_inference.py ADDED
@@ -0,0 +1,78 @@
+ from utils.load_model import load_model
+ from feature_construction.feature_construction import FeatureConstructor
+ import pandas as pd
+ from logger.logger import MongoLogger
+
+
+ def predict(x: pd.DataFrame, predict_proba: bool = True,
+             use_deployed_model: bool = True, model_file_name: str = None, predict_label: bool = False):
+     prediction = None
+     logger = MongoLogger()
+     try:
+         logger.log_to_db(level="INFO", message="entering model_inference.predict")
+
+         loaded_model = load_model(load_deployed_model=use_deployed_model, model_file_name=model_file_name)
+
+         if loaded_model is not None:
+             (label_encoder, cat_var_id_transform, cat_var_threshold_transform,
+              num_var_threshold_transform, rare_cat_transform, outlier_transform,
+              cat_missing_imputer, num_missing_imputer, one_hot_encoder,
+              ordinal_encoder, minmax_scaler, clusterer, multicoll_transform,
+              feature_selector, class_weight_flag, smote_transform, classifier, best_class_threshold) = loaded_model
+
+             feature_constructor = FeatureConstructor()
+             x = feature_constructor.add_features(x).copy()
+
+             if cat_var_id_transform is not None:
+                 x = cat_var_id_transform.transform(x).copy()
+
+             if cat_var_threshold_transform is not None:
+                 x = cat_var_threshold_transform.transform(x).copy()
+
+             if num_var_threshold_transform is not None:
+                 x = num_var_threshold_transform.transform(x).copy()
+
+             if rare_cat_transform is not None:
+                 x = rare_cat_transform.transform(x).copy()
+
+             if outlier_transform is not None:
+                 x = outlier_transform.transform(x).copy()
+
+             if cat_missing_imputer is not None:
+                 x = cat_missing_imputer.transform(x).copy()
+
+             if num_missing_imputer is not None:
+                 x = num_missing_imputer.transform(x).copy()
+
+             if one_hot_encoder is not None:
+                 x = one_hot_encoder.transform(x).copy()
+
+             if ordinal_encoder is not None:
+                 x = ordinal_encoder.transform(x).copy()
+
+             if minmax_scaler is not None:
+                 x = minmax_scaler.transform(x).copy()
+
+             if clusterer is not None:
+                 cluster_ohe, _ = clusterer.predict(x)
+                 x = pd.concat([x, cluster_ohe], axis=1)
+
+             if multicoll_transform is not None:
+                 x = multicoll_transform.transform(x).copy()
+
+             if feature_selector is not None:
+                 x = feature_selector.transform(x).copy()
+
+             if predict_proba:
+                 prediction = classifier.predict_proba(x)[:, 1]
+             else:
+                 prediction = (classifier.predict_proba(x)[:, 1] >= best_class_threshold).astype('int')
+                 if predict_label:
+                     prediction = label_encoder.inverse_transform(prediction)
+
+     except Exception as e:
+         logger.log_to_db(level="CRITICAL", message=f"unexpected model_inference.predict error: {e}")
+         raise
+     logger.log_to_db(level="INFO", message="exiting model_inference.predict")
+
+     return prediction
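A batch-inference sketch against the deployed model; the CSV name is illustrative, and the columns must match the prediction schema after cleaning:

    import pandas as pd
    from model_inference.model_inference import predict

    batch_df = pd.read_csv("new_applicants.csv")  # illustrative file name
    probs = predict(batch_df, predict_proba=True)  # class-1 probabilities
    labels = predict(batch_df, predict_proba=False, predict_label=True)  # decoded labels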
models/deployed_model.json ADDED
@@ -0,0 +1 @@
+ {"deployed_model": "model_cold_2022_11_22"}
models/model_cold_2022_11_22/feature_importances.csv ADDED
@@ -0,0 +1,36 @@
+ feature,importance
+ is_single,0.46307153
+ education_num,0.064920954
+ relationship_ Own-child,0.05042011
+ capital_gain,0.04799759
+ occupation_ Exec-managerial,0.036840715
+ marital_status_ Married-civ-spouse,0.029596165
+ occupation_ Prof-specialty,0.02784773
+ occupation_ Other-service,0.027295496
+ age,0.02378357
+ capital_loss,0.01889127
+ sex_ Female,0.017749619
+ relationship_rare_category,0.015518844
+ hours_per_week,0.015093696
+ occupation_ Adm-clerical,0.0121984985
+ race_ White,0.011818247
+ relationship_ Not-in-family,0.011008147
+ marital_status_ Never-married,0.010437393
+ occupation_rare_category,0.009737839
+ workclass_ Self-emp-not-inc,0.009453006
+ workclass_rare_category,0.00917275
+ country_ United-States,0.008190496
+ race_ Black,0.0076376186
+ workclass_ Private,0.007357159
+ education_group_high_school,0.0066947634
+ relationship_marital_ Own-child Never-married,0.0062995595
+ age_of_first_edu,0.00590806
+ education,0.0057953694
+ education_group_college,0.0057467762
+ race_rare_category,0.0057038493
+ fnlwgt,0.005448137
+ occupation_ Sales,0.005127732
+ education_group_rare_category,0.0049574347
+ occupation_ Craft-repair,0.0046369503
+ relationship_marital_ Husband Married-civ-spouse,0.0040548774
+ relationship_marital_rare_category,0.0035880853