kmckee95 committed on
Commit
08e38f9
1 Parent(s): 1c4f879

Add files via upload

Files changed (3)
  1. full_pipeline.pickle +0 -0
  2. heartdisease.py +144 -0
  3. rfc.pickle +0 -0
full_pipeline.pickle ADDED
Binary file (52.1 kB).
 
heartdisease.py ADDED
@@ -0,0 +1,144 @@
+ # import the libraries
+ import pandas as pd
+ import numpy as np
+ import seaborn as sns
+ from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
+ from sklearn.preprocessing import StandardScaler
+
+ from sklearn.impute import KNNImputer
+ from sklearn.pipeline import Pipeline
+ from sklearn.compose import ColumnTransformer
+
+
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.ensemble import GradientBoostingClassifier
+
+ # libraries for model evaluation
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import accuracy_score
+ from sklearn.metrics import ConfusionMatrixDisplay
+ from sklearn.metrics import classification_report
+
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ # read the dataset
+ df = pd.read_csv('heart.csv')
+
+ # get the categorical columns
+ categorical_cols = df.select_dtypes(include=['object'])
+
+ # print the count of unique labels in each categorical column
+ for cols in categorical_cols.columns:
+     print(cols, ':', len(categorical_cols[cols].unique()), 'labels')
+
+ # categorical columns
+ cat_col = categorical_cols.columns
+
+ # numerical columns
+ num_col = ['Age','RestingBP','Cholesterol','FastingBS','MaxHR','Oldpeak']
+
+ # define X and y
+ X = df.drop(['HeartDisease'], axis=1)
+ y = df['HeartDisease']
+
+ # create a pipeline for preprocessing the dataset
+
+ num_pipeline = Pipeline([
+     ('imputer', KNNImputer(n_neighbors=5)),
+     ('std_scaler', StandardScaler()),
+ ])
+
+ num_attribs = num_col
+ cat_attribs = cat_col
+
+ # apply transformation to the numerical and categorical columns
+ full_pipeline = ColumnTransformer([
+     ("num", num_pipeline, num_attribs),
+     ("cat", OneHotEncoder(), cat_attribs),
+ ])
+
+ X = full_pipeline.fit_transform(X)
+
+ # save preprocessed data
+ temp_df = pd.DataFrame(X)
+ temp_df.to_csv('processed_data.csv')
+
+ # Splitting the dataset into the Training set and Test set
+ from sklearn.model_selection import train_test_split
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+ # count plot for heart disease (1) / no heart disease (0) in the training labels
+ import seaborn as sns
+ sns.countplot(x=y_train, palette='OrRd')
+
+ # create a fresh model based on the tuned parameters
+ rfc1 = RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators=50, max_depth=7, criterion='gini')
+
+ rfc1.fit(X_train, y_train)
+
+ # Predicting the Test set results
+ y_pred = rfc1.predict(X_test)
+ print('Random forest accuracy_score:', accuracy_score(y_test, y_pred))
+
+ # Save the Model
+
+ import pickle
+
+ # save the random forest model for future use
+ pickle.dump(rfc1, open('rfc.pickle', 'wb'))
+
+ # save the preprocessing pipeline
+ pickle.dump(full_pipeline, open('full_pipeline.pickle', 'wb'))
+
+ # Load the Models for future use
+
+ rfc_saved = pickle.load(open('rfc.pickle', 'rb'))
+ full_pipeline_saved = pickle.load(open('full_pipeline.pickle', 'rb'))
+
+ # Visualization
+
+ target = df['HeartDisease'].replace([0, 1], ['Low', 'High'])
+
+ data = pd.crosstab(index=df['Sex'],
+                    columns=target)
+
+ data.plot(kind='bar', stacked=True)
+ plt.show()
+
+ plt.figure(figsize=(10, 5))
+ bins = [0, 30, 50, 80]
+ sns.countplot(x=pd.cut(df.Age, bins=bins), hue=target, color='r')
+ plt.show()
+
+ plt.figure(figsize=(10, 5))
+ sns.countplot(x=target, hue=df.ChestPainType)
+ plt.xticks(np.arange(2), ['No', 'Yes'])
+ plt.show()
+
+ plt.figure(figsize=(10, 5))
+ sns.countplot(x=target, hue=df.ExerciseAngina)
+ plt.xticks(np.arange(2), ['No', 'Yes'])
+ plt.show()
+
+ # feature importance
+
+ # get the importance of each feature used by the model
+ importances = rfc1.feature_importances_
+ feature_names = num_col
+ for i in cat_col:
+     feature_names = feature_names + [i] * df[i].nunique()
+
+ import pandas as pd
+
+ forest_importances = pd.Series(importances, index=feature_names)
+
+ forest_importances = forest_importances.groupby(level=0).first().sort_values(ascending=False)
+
+ # plot the features based on their importance in model performance
+ fig, ax = plt.subplots()
+ forest_importances.plot.bar(ax=ax)
+ ax.set_title("Feature importances using MDI")
+ ax.set_ylabel("Mean decrease in impurity")
+ fig.tight_layout()
rfc.pickle ADDED
Binary file (428 kB).
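
As a quick usage sketch (not part of this commit): a downstream script could load the two pickled artifacts uploaded here and score a new patient record with them. The column names follow the features referenced in heartdisease.py; RestingECG and ST_Slope do not appear in the script and, like all of the example values, are assumptions made for illustration only.

import pickle
import pandas as pd

# load the fitted preprocessing pipeline and the trained random forest saved by heartdisease.py
full_pipeline_saved = pickle.load(open('full_pipeline.pickle', 'rb'))
rfc_saved = pickle.load(open('rfc.pickle', 'rb'))

# one hypothetical patient row with the same columns as heart.csv (minus HeartDisease);
# every value below is invented for illustration
sample = pd.DataFrame([{
    'Age': 54, 'Sex': 'M', 'ChestPainType': 'ATA', 'RestingBP': 130,
    'Cholesterol': 220, 'FastingBS': 0, 'RestingECG': 'Normal',
    'MaxHR': 150, 'ExerciseAngina': 'N', 'Oldpeak': 1.0, 'ST_Slope': 'Up'
}])

# reuse the fitted ColumnTransformer, then predict with the saved model
X_new = full_pipeline_saved.transform(sample)
print(rfc_saved.predict(X_new))  # 1 = heart disease, 0 = no heart disease

Note that the OneHotEncoder in the saved pipeline was fitted with its default handle_unknown='error', so category values that never appear in heart.csv would raise an error at transform time.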