Commit ac88758 • 1 Parent(s): a0fb7c7

Upload 5 files

Files changed (5)
  1. Procfile.txt +1 -0
  2. app.py +319 -0
  3. datasetforriskofcardiodisease (1).csv +0 -0
  4. requirements.txt +8 -0
  5. setup.sh +2 -0
Procfile.txt ADDED
@@ -0,0 +1 @@
+ web: source setup.sh && python app.py
app.py ADDED
@@ -0,0 +1,319 @@
+ # importing data analysis libraries
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import roc_curve, roc_auc_score
+ from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.svm import SVC
+ import warnings
+ warnings.filterwarnings('ignore')
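+ # silences all library warnings (sklearn/pandas deprecation messages included) so they do not clutter the console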
+
+ # data is from https://www.kaggle.com/datasets/thedevastator/exploring-risk-factors-for-cardiovascular-diseas
+ data = pd.read_csv('datasetforriskofcardiodisease.csv')
+ data_num = data[['age','height','weight','ap_hi','ap_lo']]
+ data_cat = data[['gender','cholesterol','gluc','smoke','alco','active']]
+ xaxis = ['Age', 'Height', 'Weight', 'Systolic Blood Pressure', 'Diastolic Blood Pressure']
+
+ for i, col in enumerate(data_num.columns):
+     plt.hist(data_num[col])
+     plt.title(f'Frequency vs. {xaxis[i]}')
+     plt.xlabel(xaxis[i])
+     plt.ylabel('Frequency')
+     #plt.show()
+
+ print(pd.pivot_table(data, index='cardio', values=['age','height','weight','ap_hi','ap_lo']))
+
+ for i in data_cat.columns:
+     sns.barplot(x=data_cat[i].value_counts().index, y=data_cat[i].value_counts()).set_title(i)
+     #plt.show()
+
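+ # the pivot tables below show the mean of each numeric variable, split by cardio outcome and by each categorical variable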
+ # age and categorical variables
+ print(pd.pivot_table(data, index='cardio', columns='cholesterol', values='age'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='gluc', values='age'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='smoke', values='age'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='alco', values='age'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='active', values='age'))
+ # ap_hi (systolic blood pressure) and categorical variables
+ print(pd.pivot_table(data, index='cardio', columns='cholesterol', values='ap_hi'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='gluc', values='ap_hi'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='smoke', values='ap_hi'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='alco', values='ap_hi'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='active', values='ap_hi'))
+ # ap_lo (diastolic blood pressure) and categorical variables
+ print(pd.pivot_table(data, index='cardio', columns='cholesterol', values='ap_lo'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='gluc', values='ap_lo'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='smoke', values='ap_lo'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='alco', values='ap_lo'))
+ print("="*100)
+ print(pd.pivot_table(data, index='cardio', columns='active', values='ap_lo'))
+
+ for i in data_num.columns:
+     sns.boxplot(data_num[i])
+     plt.title(i)
+     #plt.show()
+
+ # Getting the interquartile range and the usual 1.5*IQR outlier fences
+ def outlinefree(dataCol):
+     Q1, Q3 = np.percentile(dataCol, [25, 75])
+     IQR = Q3 - Q1
+     LowerRange = Q1 - (1.5 * IQR)
+     UpperRange = Q3 + (1.5 * IQR)
+     return LowerRange, UpperRange
+
+ # Capping outliers: blood pressure values above the upper fence are clipped to that fence
+ lwap_hi, upap_hi = outlinefree(data['ap_hi'])
+ lwap_lo, upap_lo = outlinefree(data['ap_lo'])
+
+ data.loc[data['ap_hi'] > upap_hi, 'ap_hi'] = upap_hi
+ data.loc[data['ap_lo'] > upap_lo, 'ap_lo'] = upap_lo
+
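+ # every column except the last is used as a feature; the last column (the cardio outcome) is the label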
+ features = data.iloc[:,:-1].values
+ label = data.iloc[:,-1].values
+
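+ # each classifier below is trained on its own 75/25 train/test split (note the different random_state values)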
+ #------------------------ Logistic Regression ------------------------
+ X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=102)
+
+ classimodel = LogisticRegression()
+ classimodel.fit(X_train, y_train)
+ trainscore = classimodel.score(X_train, y_train)
+ testscore = classimodel.score(X_test, y_test)
+ print("Logistic Regression-----------------------------------------------------\n")
+ print("test score: {} train score: {}".format(testscore, trainscore), '\n')
+
+ y_pred = classimodel.predict(X_test)
+
+ print(confusion_matrix(y_test, y_pred))
+
+ print(' f1 score: ', f1_score(y_test, y_pred), '\n')
+ print(' precision score: ', precision_score(y_test, y_pred), '\n')
+ print(' recall score: ', recall_score(y_test, y_pred), '\n')
+ print(classification_report(y_test, y_pred))
+
+ #------------------------ K-Nearest Neighbors (KNN) ------------------------
+ X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=193)
+
+ classifier = KNeighborsClassifier()
+ knnmodel = classifier.fit(X_train, y_train)
+
+ trainscore = knnmodel.score(X_train, y_train)
+ testscore = knnmodel.score(X_test, y_test)
+ print("KNN-----------------------------------------------------\n")
+ print("test score: {} train score: {}".format(testscore, trainscore), '\n')
+
+ y_predknn = knnmodel.predict(X_test)
+
+ print(confusion_matrix(y_test, y_predknn))
+
+ print("f1_score: ", f1_score(y_test, y_predknn), '\n')
+ print("precision_score: ", precision_score(y_test, y_predknn), '\n')
+ print("recall_score: ", recall_score(y_test, y_predknn), '\n')
+ print(classification_report(y_test, y_predknn))
+
+ #------------------------ Naive Bayes ------------------------
+ X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=34)
+
+ NBmodel = GaussianNB()
+ NBmodel.fit(X_train, y_train)
+
+ trainscore = NBmodel.score(X_train, y_train)
+ testscore = NBmodel.score(X_test, y_test)
+ print("Naive Bayes-----------------------------------------------------\n")
+ print("test score: {} train score: {}".format(testscore, trainscore), '\n')
+ y_predNB = NBmodel.predict(X_test)
+ print(confusion_matrix(y_test, y_predNB))
+
+ print("f1_score: ", f1_score(y_test, y_predNB), '\n')
+ print("precision_score: ", precision_score(y_test, y_predNB), '\n')
+ print("recall_score: ", recall_score(y_test, y_predNB), '\n')
+ print(classification_report(y_test, y_predNB))
+
+ #------------------------ XGBoost ------------------------
+ import xgboost as xgb
+
+ X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=102)
+
+ # XGBRFClassifier is XGBoost's random-forest-style ensemble rather than plain gradient boosting
+ XGmodel = xgb.XGBRFClassifier()
+ XGmodel.fit(X_train, y_train)
+ trainscore = XGmodel.score(X_train, y_train)
+ testscore = XGmodel.score(X_test, y_test)
+ print("XGBoost-----------------------------------------------------\n")
+ print("test score: {} train score: {}".format(testscore, trainscore), '\n')
+
+ y_predXG = XGmodel.predict(X_test)
+
+ print(confusion_matrix(y_test, y_predXG))
+
+ print("f1_score: ", f1_score(y_test, y_predXG), '\n')
+ print("precision_score: ", precision_score(y_test, y_predXG), '\n')
+ print("recall_score: ", recall_score(y_test, y_predXG), '\n')
+ print(classification_report(y_test, y_predXG), '\n')
+ print("AREA UNDER CURVES-----------------------------------------------------\n")
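+ # the ROC/AUC figures below are computed on the full dataset (train + test) with each already-fitted model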
+ #------------------------ Logistic Regression ------------------------
+ probabilityValues = classimodel.predict_proba(features)[:, 1]
+ # Calculate AUC
+ auc = roc_auc_score(label, probabilityValues)
+ print("Logistic Regression AUC:", auc)
+ # Calculate roc_curve
+ fpr, tpr, threshold = roc_curve(label, probabilityValues)
+ plt.plot([0, 1], [0, 1], linestyle='--')
+ plt.plot(fpr, tpr)
+
+ #------------------------ KNeighborsClassifier ------------------------
+ probabilityValues = knnmodel.predict_proba(features)[:, 1]
+ # Calculate AUC
+ auc = roc_auc_score(label, probabilityValues)
+ print("KNN AUC:", auc)
+ # Calculate roc_curve
+ fpr, tpr, threshold = roc_curve(label, probabilityValues)
+ plt.plot([0, 1], [0, 1], linestyle='--')
+ plt.plot(fpr, tpr)
+
+ #------------------------ Naive Bayes ------------------------
+ probabilityValues = NBmodel.predict_proba(features)[:, 1]
+ # Calculate AUC
+ auc = roc_auc_score(label, probabilityValues)
+ print("Naive Bayes AUC:", auc)
+ # Calculate roc_curve
+ fpr, tpr, threshold = roc_curve(label, probabilityValues)
+ plt.plot([0, 1], [0, 1], linestyle='--')
+ plt.plot(fpr, tpr)
+
+ #------------------------ XGBoost ------------------------
+ probabilityValues = XGmodel.predict_proba(features)[:, 1]
+ # Calculate AUC
+ auc = roc_auc_score(label, probabilityValues)
+ print("XGBoost AUC:", auc)
+ # Calculate roc_curve
+ fpr, tpr, threshold = roc_curve(label, probabilityValues)
+ plt.plot([0, 1], [0, 1], linestyle='--')
+ plt.plot(fpr, tpr)
+ '''
+ #--------------------------------------INTERACE TIME LETS GO BOYS-----------------------
+ from sklearn.feature_extraction.text import CountVectorizer
+ import joblib
+ import matplotlib
+ matplotlib.use("agg")
+ model_file_name = 'XG_best_model.joblib'
+ model_folder = 'C:\\Users\\Ben Z\\Downloads\\Models\\'
+ joblib.dump(XGmodel, model_folder+''+model_file_name)
+
+ #Loading da model
+ loaded_XG_model = joblib.load(open(model_folder+''+model_file_name, 'rb'))
+ print (loaded_XG_model)
+ def make_prediction(value1, checkbox1, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5):
+     input_array = np.array([value1*365.25, checkbox1, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5]).reshape(1, -1)
+     prediction = loaded_XG_model.predict(input_array)
+     info = ''
+     if prediction[0] == 0:
+         info = "You are not currently at risk of a cardiovascular disease! ✅"
+     else:
+         info = "You are at risk of a cardiovascular disease. I would recommend going to the doctor however, take my advice with a grain of salt as I am an AI model capable of making mistakes. 🚨"
+     final_info = "The prediction is: {}".format(info)
+     print (prediction[0])
+     return final_info
+
+ input_values = [50.3572895, 1, 168, 62, 110, 80, 1, 1, 0, 0, 1]
+ result = make_prediction(*input_values)
+ print(result)
+
+ #------------------------------------------------GRADIO Time lmfao
+ import gradio as gr
+
+
+ headline = "Cardiovascular Disease Risk Prediction Application"
+ iface = gr.Interface(fn=make_prediction, inputs=
+     [gr.inputs.Number(label="Age (Years)"),
+      gr.inputs.Checkbox(label="I am a male"),
+      gr.inputs.Number(label="Height (cm)"),
+      gr.inputs.Number(label="Weight (kg)"),
+      gr.inputs.Number(label="Systolic Blood Pressure (mmHg)"),
+      gr.inputs.Number(label="Diastolic Blood Pressure (mmHg)"),
+      gr.inputs.Number(label="Cholesterol (per 20mg/dL)"),
+      gr.inputs.Number(label="Glucose (per 1 mmol/L)"),
+      gr.inputs.Checkbox(label="I have smoked."),
+      gr.inputs.Checkbox(label="I drink more alcohol than I should (>2 cups for men and >1 cup for women)."),
+      gr.inputs.Checkbox(label="I am physically active.")
+     ], outputs=gr.outputs.Textbox(label="Prediction Result"), title=headline, theme='soft')
+
+ if __name__ == "__main__":
+     iface.launch(share=True)
+ '''
+ #------------------------ INTERFACE ------------------------
+ import os
+ import joblib
+ import matplotlib
+ matplotlib.use("agg")  # headless backend for the server environment
+ model_file_name = 'XG_best_model.joblib'
+ model_folder = 'models/'  # relative path so the app also works outside the original Windows machine
+ os.makedirs(model_folder, exist_ok=True)
+ joblib.dump(XGmodel, model_folder + model_file_name)
+
+ # Loading the saved model back
+ loaded_XG_model = joblib.load(model_folder + model_file_name)
+ print(loaded_XG_model)
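+ # make_prediction assembles a single feature row in the same column order as the training data;
+ # the UI collects age in years and multiplies by 365.25 because the dataset stores age in days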
+ def make_prediction(value1, checkbox1, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5):
+     checkbox1 = 1 if "Male" in checkbox1 else 0
+     input_array = np.array([value1*365.25, checkbox1, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5]).reshape(1, -1)
+     prediction = loaded_XG_model.predict(input_array)
+     if prediction[0] == 0:
+         info = "You are not currently at risk of a cardiovascular disease! ✅"
+     else:
+         info = "You are at risk of a cardiovascular disease. I would recommend seeing a doctor; however, take this with a grain of salt, as I am an AI model capable of making mistakes. 🚨"
+     final_info = "The prediction is: {}".format(info)
+     return final_info
+
+ #input_values = [50.3572895, 1, 168, 62, 110, 80, 1, 1, 0, 0, 1]
+ #result = make_prediction(*input_values)
+ #print(result)
+
+ #------------------------ Gradio interface ------------------------
+ import gradio as gr
+
+ headline = "Cardiovascular Disease Risk Prediction Application"
+ iface = gr.Interface(fn=make_prediction, inputs=
+     [gr.inputs.Number(label="Age (Years)"),
+      gr.inputs.CheckboxGroup(
+          label="Gender",
+          choices=["Male", "Female"],
+      ),
+      gr.inputs.Number(label="Height (cm)"),
+      gr.inputs.Number(label="Weight (kg)"),
+      gr.inputs.Number(label="Systolic Blood Pressure (mmHg)"),
+      gr.inputs.Number(label="Diastolic Blood Pressure (mmHg)"),
+      gr.inputs.Number(label="Cholesterol (per 20 mg/dL)"),
+      gr.inputs.Number(label="Glucose (per 1 mmol/L)"),
+      gr.inputs.Checkbox(label="I have smoked."),
+      gr.inputs.Checkbox(label="I drink more alcohol than I should (>2 cups for men and >1 cup for women)."),
+      gr.inputs.Checkbox(label="I am physically active.")
+     ], outputs=gr.outputs.Textbox(label="Prediction Result"), title=headline, theme='soft')
+
+ if __name__ == "__main__":
+     iface.launch(share=False)
datasetforriskofcardiodisease (1).csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ numpy
+ pandas
+ seaborn
+ matplotlib
+ scikit-learn
+ xgboost
+ gradio
+ joblib
setup.sh ADDED
@@ -0,0 +1,2 @@
+ export GRADIO_SERVER_NAME=0.0.0.0
+ export GRADIO_SERVER_PORT="$PORT"
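+ # GRADIO_SERVER_NAME=0.0.0.0 makes Gradio listen on all interfaces; GRADIO_SERVER_PORT picks up the platform-assigned $PORT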