Spaces:
Runtime error
Runtime error
kmckee95
committed on
Commit
•
0b7a569
1
Parent(s):
8a82e75
Delete heartdisease.py
Browse files- heartdisease.py +0 -144
heartdisease.py
DELETED
@@ -1,144 +0,0 @@
|
|
1 |
-
# import the libraries
|
2 |
-
import pandas as pd
|
3 |
-
import numpy as np
|
4 |
-
import seaborn as sns
|
5 |
-
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
|
6 |
-
from sklearn.preprocessing import StandardScaler
|
7 |
-
|
8 |
-
from sklearn.impute import KNNImputer
|
9 |
-
from sklearn.pipeline import Pipeline
|
10 |
-
from sklearn.compose import ColumnTransformer
|
11 |
-
|
12 |
-
|
13 |
-
from sklearn.linear_model import LogisticRegression
|
14 |
-
from sklearn.ensemble import RandomForestClassifier
|
15 |
-
from sklearn.ensemble import GradientBoostingClassifier
|
16 |
-
|
17 |
-
#libraries for model evaluation
|
18 |
-
import matplotlib.pyplot as plt
|
19 |
-
from sklearn.metrics import accuracy_score
|
20 |
-
# plot_confusion_matrix was removed from scikit-learn in 1.2 (superseded by
# ConfusionMatrixDisplay.from_estimator). Nothing in this script calls it, so
# a failed import must not abort the whole run.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None
|
21 |
-
from sklearn.metrics import classification_report
|
22 |
-
|
23 |
-
import warnings
|
24 |
-
warnings.filterwarnings('ignore')
|
25 |
-
|
26 |
-
# read the dataset
|
27 |
-
df = pd.read_csv('heart.csv')
|
28 |
-
|
29 |
-
# get categorical columns
|
30 |
-
categorical_cols= df.select_dtypes(include=['object'])
|
31 |
-
|
32 |
-
# get count of unique values for categorical columns
|
33 |
-
for cols in categorical_cols.columns:
|
34 |
-
print(cols,':', len(categorical_cols[cols].unique()),'labels')
|
35 |
-
|
36 |
-
# categorical columns
|
37 |
-
cat_col = categorical_cols.columns
|
38 |
-
|
39 |
-
# numerical column
|
40 |
-
num_col = ['Age','RestingBP','Cholesterol','FastingBS','MaxHR','Oldpeak']
|
41 |
-
|
42 |
-
# define X and y
|
43 |
-
X = df.drop(['HeartDisease'],axis=1)
|
44 |
-
y = df['HeartDisease']
|
45 |
-
|
46 |
-
# create a pipeline for preprocessing the dataset
|
47 |
-
|
48 |
-
num_pipeline = Pipeline([
|
49 |
-
('imputer', KNNImputer(n_neighbors=5)),
|
50 |
-
('std_scaler', StandardScaler()),
|
51 |
-
])
|
52 |
-
|
53 |
-
num_attribs = num_col
|
54 |
-
cat_attribs = cat_col
|
55 |
-
|
56 |
-
# apply transformation to the numerical and categorical columns
|
57 |
-
full_pipeline = ColumnTransformer([
|
58 |
-
("num", num_pipeline, num_attribs),
|
59 |
-
("cat", OneHotEncoder(), cat_attribs),
|
60 |
-
])
|
61 |
-
|
62 |
-
X = full_pipeline.fit_transform(X)
|
63 |
-
|
64 |
-
# save preprocessed data
|
65 |
-
temp_df = pd.DataFrame(X)
|
66 |
-
temp_df.to_csv('processed_data.csv')
|
67 |
-
|
68 |
-
# Splitting the dataset into the Training set and Test set
|
69 |
-
from sklearn.model_selection import train_test_split
|
70 |
-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
|
71 |
-
|
72 |
-
# count plot for number of heart disease(1)/No heart disease(0)
|
73 |
-
import seaborn as sns
|
74 |
-
# Pass the series by keyword, like the other countplot calls in this script;
# in current seaborn the first positional parameter is `data`, not `x`.
sns.countplot(x=y_train, palette='OrRd')
|
75 |
-
|
76 |
-
# create a fresh model based on tuned parameters
|
77 |
-
rfc1=RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators= 50, max_depth=7, criterion='gini')
|
78 |
-
|
79 |
-
rfc1.fit(X_train, y_train)
|
80 |
-
|
81 |
-
# Predicting the Test set results
|
82 |
-
y_pred = rfc1.predict(X_test)
|
83 |
-
print('Random forest accuracy_score:',accuracy_score(y_test,y_pred))
|
84 |
-
|
85 |
-
# Save the Model
|
86 |
-
|
87 |
-
import pickle
|
88 |
-
|
89 |
-
# save the random forest model for future use
|
90 |
-
# Use a context manager so the file is flushed and closed before it is
# re-opened for loading further down.
with open('rfc.pickle', 'wb') as model_file:
    pickle.dump(rfc1, model_file)
|
91 |
-
|
92 |
-
# save the preprocessing pipeline
|
93 |
-
# Use a context manager so the file is flushed and closed before it is
# re-opened for loading further down.
with open('full_pipeline.pickle', 'wb') as pipeline_file:
    pickle.dump(full_pipeline, pipeline_file)
|
94 |
-
|
95 |
-
# Load the Models for future use
|
96 |
-
|
97 |
-
# Close the handle deterministically after loading.
# NOTE: pickle.load executes arbitrary code; only load files this script wrote.
with open('rfc.pickle', 'rb') as model_file:
    rfc_saved = pickle.load(model_file)
|
98 |
-
# Close the handle deterministically after loading.
# NOTE: pickle.load executes arbitrary code; only load files this script wrote.
with open('full_pipeline.pickle', 'rb') as pipeline_file:
    full_pipeline_saved = pickle.load(pipeline_file)
|
99 |
-
|
100 |
-
# Visualization
|
101 |
-
|
102 |
-
target = df['HeartDisease'].replace([0,1],['Low','High'])
|
103 |
-
|
104 |
-
data = pd.crosstab(index=df['Sex'],
|
105 |
-
columns=target)
|
106 |
-
|
107 |
-
data.plot(kind='bar',stacked=True)
|
108 |
-
plt.show()
|
109 |
-
|
110 |
-
plt.figure(figsize=(10,5))
|
111 |
-
bins=[0,30,50,80]
|
112 |
-
sns.countplot(x=pd.cut(df.Age,bins=bins),hue=target,color='r')
|
113 |
-
plt.show()
|
114 |
-
|
115 |
-
plt.figure(figsize=(10,5))
|
116 |
-
sns.countplot(x=target,hue=df.ChestPainType)
|
117 |
-
plt.xticks(np.arange(2), ['No', 'Yes'])
|
118 |
-
plt.show()
|
119 |
-
|
120 |
-
plt.figure(figsize=(10,5))
|
121 |
-
sns.countplot(x=target,hue=df.ExerciseAngina)
|
122 |
-
plt.xticks(np.arange(2), ['No', 'Yes'])
|
123 |
-
plt.show()
|
124 |
-
|
125 |
-
# feature importance
|
126 |
-
|
127 |
-
# get important features used by model
|
128 |
-
importances = rfc1.feature_importances_
|
129 |
-
feature_names = num_col
|
130 |
-
for i in cat_col:
|
131 |
-
feature_names = feature_names + [i]*df[i].nunique()
|
132 |
-
|
133 |
-
import pandas as pd
|
134 |
-
|
135 |
-
forest_importances = pd.Series(importances, index=feature_names)
|
136 |
-
|
137 |
-
# Each categorical feature appears once per one-hot column in the index, so
# add the per-column importances together to get the feature's total.
# (.first() would keep only the first category's share and drop the rest.)
forest_importances = forest_importances.groupby(level=0).sum().sort_values(ascending=False)
|
138 |
-
|
139 |
-
# plot the features based on their importance in model performance.
|
140 |
-
fig, ax = plt.subplots()
|
141 |
-
forest_importances.plot.bar()
|
142 |
-
ax.set_title("Feature importances using MDI")
|
143 |
-
ax.set_ylabel("Mean decrease in impurity")
|
144 |
-
fig.tight_layout()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|