| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.pipeline import Pipeline | |
| from scripts.download_data import download_data | |
| from sklearn.metrics import f1_score | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.preprocessing import PowerTransformer | |
| from sklearn.preprocessing import OneHotEncoder | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.preprocessing import QuantileTransformer | |
| import pandas as pd | |
def calculate_metric(model):
    """Return the F1 score of *model* on the test split from ``download_data``.

    Args:
        model: Object exposing ``predict(X)``; it is called with the test
            frame minus its ``'cardio'`` column.

    Returns:
        The value of ``f1_score`` computed between the ``'cardio'`` column
        and the model's predictions.
    """
    # Only the second element returned by download_data() is used here.
    _unused_train, holdout = download_data()

    features = holdout.drop(columns=['cardio'])
    labels = holdout['cardio']

    predictions = model.predict(features)

    # NOTE(review): pos_label='positive' presumes the 'cardio' labels are the
    # strings 'positive'/'negative' -- confirm against download_data().
    score = f1_score(labels, predictions, pos_label='positive')
    return score
def model_training():
    """Fit the preprocessing + random-forest pipeline and score it.

    The training split from ``download_data`` is separated into features and
    the ``'cardio'`` target, a ``ColumnTransformer`` preprocesses numeric and
    categorical columns independently, and a ``RandomForestClassifier`` is
    fitted on the result.

    Returns:
        A ``(pipeline, f1)`` tuple: the fitted ``Pipeline`` and the value
        returned by ``calculate_metric`` for it.
    """
    # Only the first element returned by download_data() is used here.
    training_frame, _unused_test = download_data()
    features = training_frame.drop(columns=['cardio'])
    target = training_frame['cardio']

    numeric_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
    categorical_features = [
        'gender',
        'cholesterol',
        'gluc',
        'smoke',
        'alco',
        'active',
    ]

    # Numeric columns go through three transforms in sequence.
    # NOTE(review): PowerTransformer standardizes its output by default, so
    # the preceding StandardScaler may be redundant -- confirm before removing.
    numeric_steps = Pipeline(steps=[
        ('qt', QuantileTransformer(output_distribution="normal")),
        ('scaler', StandardScaler()),
        ('power', PowerTransformer()),
    ])

    # Categories not seen during fit are ignored at transform time.
    categorical_steps = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

    column_preprocessing = ColumnTransformer(transformers=[
        ('num_p', numeric_steps, numeric_features),
        ('cat_p', categorical_steps, categorical_features),
    ])

    # oob_score=True is requested but the resulting score is not read here.
    forest = RandomForestClassifier(
        n_estimators=200,
        criterion="gini",
        min_samples_split=15,
        max_depth=15,
        oob_score=True,
    )

    full_pipeline = Pipeline(steps=[
        ('preprocessors', column_preprocessing),
        ('model', forest),
    ])
    full_pipeline.fit(features, target)

    test_f1 = calculate_metric(full_pipeline)
    return full_pipeline, test_f1