kmckee95 committed on
Commit
0b7a569
1 Parent(s): 8a82e75

Delete heartdisease.py

Files changed (1)
  1. heartdisease.py +0 -144
heartdisease.py DELETED
@@ -1,144 +0,0 @@
- # import the libraries
- import pandas as pd
- import numpy as np
- import seaborn as sns
- from sklearn.preprocessing import OneHotEncoder
- from sklearn.preprocessing import StandardScaler
-
- from sklearn.impute import KNNImputer
- from sklearn.pipeline import Pipeline
- from sklearn.compose import ColumnTransformer
- from sklearn.model_selection import train_test_split
-
- # alternative models imported for experimentation; only the random forest is used below
- from sklearn.linear_model import LogisticRegression
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.ensemble import GradientBoostingClassifier
-
- # libraries for model evaluation
- import matplotlib.pyplot as plt
- from sklearn.metrics import accuracy_score
- # plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay replaces it
- from sklearn.metrics import ConfusionMatrixDisplay
- from sklearn.metrics import classification_report
-
- import warnings
- warnings.filterwarnings('ignore')
-
- # read the dataset
- df = pd.read_csv('heart.csv')
-
- # get the categorical columns
- categorical_cols = df.select_dtypes(include=['object'])
-
- # print the count of unique labels for each categorical column
- for col in categorical_cols.columns:
-     print(col, ':', len(categorical_cols[col].unique()), 'labels')
-
- # categorical column names
- cat_col = categorical_cols.columns
-
- # numerical column names
- num_col = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
-
- # define the features X and the target y
- X = df.drop(['HeartDisease'], axis=1)
- y = df['HeartDisease']
-
- # create a pipeline for preprocessing the dataset
-
- # numerical columns: impute missing values, then standardize
- num_pipeline = Pipeline([
-     ('imputer', KNNImputer(n_neighbors=5)),
-     ('std_scaler', StandardScaler()),
- ])
-
- num_attribs = num_col
- cat_attribs = cat_col
-
- # apply the numerical pipeline to the numerical columns
- # and one-hot encoding to the categorical columns
- full_pipeline = ColumnTransformer([
-     ("num", num_pipeline, num_attribs),
-     ("cat", OneHotEncoder(), cat_attribs),
- ])
-
- X = full_pipeline.fit_transform(X)
-
- # save the preprocessed data
- temp_df = pd.DataFrame(X)
- temp_df.to_csv('processed_data.csv', index=False)
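-
- # A sketch, assuming a recent scikit-learn (>= 1.1, where the fitted
- # ColumnTransformer and its inner Pipeline expose get_feature_names_out)
- # and a dense transformed output, as pd.DataFrame(X) above already does:
- # save a copy of the processed data with readable column names.
- named_df = pd.DataFrame(X, columns=full_pipeline.get_feature_names_out())
- named_df.to_csv('processed_data_named.csv', index=False)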
-
- # split the dataset into the training set and the test set
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
- # count plot of heart disease (1) vs. no heart disease (0) in the training set
- sns.countplot(x=y_train, palette='OrRd')
- plt.show()
-
- # create a fresh model with the tuned hyperparameters
- rfc1 = RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators=50, max_depth=7, criterion='gini')
- rfc1.fit(X_train, y_train)
-
- # predict the test set results
- y_pred = rfc1.predict(X_test)
- print('Random forest accuracy_score:', accuracy_score(y_test, y_pred))
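-
- # The evaluation imports above also include classification_report and
- # ConfusionMatrixDisplay; a minimal sketch of using them on the same predictions:
- print(classification_report(y_test, y_pred))
- ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
- plt.show()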
-
- # save the model
- import pickle
-
- # save the random forest model for future use
- with open('rfc.pickle', 'wb') as f:
-     pickle.dump(rfc1, f)
-
- # save the preprocessing pipeline
- with open('full_pipeline.pickle', 'wb') as f:
-     pickle.dump(full_pipeline, f)
-
- # load the models for future use
- with open('rfc.pickle', 'rb') as f:
-     rfc_saved = pickle.load(f)
- with open('full_pipeline.pickle', 'rb') as f:
-     full_pipeline_saved = pickle.load(f)
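-
- # A minimal sketch of reusing the saved artifacts on unseen raw rows
- # (illustration only: the first rows of the original dataframe stand in for new data)
- new_rows = df.drop(['HeartDisease'], axis=1).head()
- print('Predictions on new rows:', rfc_saved.predict(full_pipeline_saved.transform(new_rows)))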
-
- # Visualization
-
- # relabel the target for readable plot legends
- target = df['HeartDisease'].replace([0, 1], ['Low', 'High'])
-
- # stacked bar chart of heart disease by sex
- data = pd.crosstab(index=df['Sex'], columns=target)
- data.plot(kind='bar', stacked=True)
- plt.show()
-
- # heart disease counts by age group
- plt.figure(figsize=(10, 5))
- bins = [0, 30, 50, 80]
- sns.countplot(x=pd.cut(df.Age, bins=bins), hue=target, color='r')
- plt.show()
-
- # heart disease counts by chest pain type
- plt.figure(figsize=(10, 5))
- sns.countplot(x=target, hue=df.ChestPainType)
- plt.xticks(np.arange(2), ['No', 'Yes'])
- plt.show()
-
- # heart disease counts by exercise-induced angina
- plt.figure(figsize=(10, 5))
- sns.countplot(x=target, hue=df.ExerciseAngina)
- plt.xticks(np.arange(2), ['No', 'Yes'])
- plt.show()
-
- # feature importance
-
- # get the impurity-based importances from the fitted model
- importances = rfc1.feature_importances_
-
- # rebuild the feature names in the order the ColumnTransformer emits them:
- # the numerical columns first, then one one-hot column per category
- # of each categorical column
- feature_names = num_col
- for i in cat_col:
-     feature_names = feature_names + [i] * df[i].nunique()
-
- forest_importances = pd.Series(importances, index=feature_names)
-
- # sum the one-hot importances back onto their original categorical feature
- # (taking only the first would drop the other categories' contributions)
- forest_importances = forest_importances.groupby(level=0).sum().sort_values(ascending=False)
-
- # plot the features by their importance to model performance
- fig, ax = plt.subplots()
- forest_importances.plot.bar(ax=ax)
- ax.set_title("Feature importances using MDI")
- ax.set_ylabel("Mean decrease in impurity")
- fig.tight_layout()
- plt.show()