benjaminzuckermanbasisscottsdale
committed on
Commit
•
ac88758
1
Parent(s):
a0fb7c7
Upload 5 files
Browse files- Procfile.txt +1 -0
- app.py +319 -0
- datasetforriskofcardiodisease (1).csv +0 -0
- requirements.txt +8 -0
- setup.sh +2 -0
Procfile.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
web: source setup.sh && python app.py
|
app.py
ADDED
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#importing data analysis libraries
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import seaborn as sns
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
from sklearn.metrics import roc_curve, roc_auc_score
|
7 |
+
from sklearn.metrics import confusion_matrix ,classification_report,precision_score, recall_score ,f1_score
|
8 |
+
from sklearn.model_selection import train_test_split
|
9 |
+
from sklearn.linear_model import LogisticRegression
|
10 |
+
from sklearn.neighbors import KNeighborsClassifier
|
11 |
+
from sklearn.naive_bayes import GaussianNB
|
12 |
+
from sklearn.svm import SVC
|
13 |
+
import warnings
|
14 |
+
warnings.filterwarnings('ignore')
|
15 |
+
|
16 |
+
#data is from https://www.kaggle.com/datasets/thedevastator/exploring-risk-factors-for-cardiovascular-diseas
|
17 |
+
import os

# The commit uploads the dataset as 'datasetforriskofcardiodisease (1).csv',
# while the script historically opened 'datasetforriskofcardiodisease.csv'.
# Accept either name so the app starts regardless of which one is on disk; if
# neither exists we fall back to the original name so the FileNotFoundError
# message is unchanged.
_csv_candidates = (
    'datasetforriskofcardiodisease.csv',
    'datasetforriskofcardiodisease (1).csv',
)
_csv_path = next(
    (candidate for candidate in _csv_candidates if os.path.exists(candidate)),
    _csv_candidates[0],
)
data = pd.read_csv(_csv_path)

# Column subsets used by the exploratory plots below.
data_num = data[['age','height','weight','ap_hi','ap_lo']]
data_cat = data[['gender','cholesterol','gluc','smoke','alco','active']]
# Human-readable axis labels, in the same order as the data_num columns.
xaxis = ['Age', 'Height', 'Weight', 'Systolic Blood Pressure', 'Diastolic Blood Pressure']
|
21 |
+
|
22 |
+
# Histogram of every numeric column, labelled with its human-readable name.
# plt.show() stays disabled, so the plots are drawn but never displayed.
for col, axis_label in zip(data_num.columns, xaxis):
    plt.hist(data_num[col])
    plt.title(f'Frequency vs. {axis_label}')
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
    #plt.show()

# Mean of each numeric column per 'cardio' class (result is not kept).
pd.pivot_table(data, index='cardio', values=['age','height','weight','ap_hi','ap_lo'])

# Bar plot of the value counts of every categorical column.
for cat_col in data_cat.columns:
    counts = data_cat[cat_col].value_counts()
    bar_axes = sns.barplot(x=counts.index, y=counts)
    bar_axes.set_title(cat_col)
    #plt.show()
|
34 |
+
|
35 |
+
# For age, ap_hi (systolic blood pressure) and ap_lo (diastolic blood
# pressure): print the per-'cardio' pivot against every categorical variable.
# A separator line is printed between tables of the same measure, but not
# between two measures.
categorical_breakdowns = ('cholesterol', 'gluc', 'smoke', 'alco', 'active')
for measure in ('age', 'ap_hi', 'ap_lo'):
    for position, category in enumerate(categorical_breakdowns):
        if position:
            print("="*100)
        print(pd.pivot_table(data, index='cardio', columns=category, values=measure))

# Box plot of every numeric column (display is disabled).
for num_col in data_num.columns:
    sns.boxplot(data_num[num_col])
    plt.title(num_col)
    #plt.show()
|
70 |
+
|
71 |
+
#Getting interquartile range
|
72 |
+
def outlinefree(dataCol):
    """Return the (lower, upper) outlier fences of *dataCol*.

    The fences are the usual 1.5 * IQR bounds: ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles as
    computed by ``np.percentile``.

    Args:
        dataCol: array-like of numbers (list, NumPy array, pandas Series).
            It is not modified.

    Returns:
        Tuple ``(LowerRange, UpperRange)``.
    """
    # np.percentile does its own ordering, so no explicit sort is needed
    # (the previous `sorted(dataCol)` call discarded its result anyway).
    Q1, Q3 = np.percentile(dataCol, [25, 75])
    IQR = Q3 - Q1
    LowerRange = Q1 - (1.5 * IQR)
    UpperRange = Q3 + (1.5 * IQR)
    return LowerRange, UpperRange
|
79 |
+
|
80 |
+
#Removing outliers
|
81 |
+
# 1.5 * IQR fences for both blood-pressure columns.
lwap_hi,upap_hi = outlinefree(data['ap_hi'])
lwap_lo,upap_lo = outlinefree(data['ap_lo'])

# Cap every reading above the upper fence at the fence value.
# Assigning the clipped column back replaces the former chained
# `data[col].replace(..., inplace=True)`, which emits a FutureWarning on
# pandas 2.2 and silently leaves `data` unchanged under Copy-on-Write.
# NOTE(review): the lower fences (lwap_hi / lwap_lo) are computed but, as
# before, not applied -- confirm whether low outliers should be capped too.
data['ap_hi'] = data['ap_hi'].clip(upper=upap_hi)
data['ap_lo'] = data['ap_lo'].clip(upper=upap_lo)

# Every column except the last is a model input; the last column is the target.
features = data.iloc[:,:-1].values
label = data.iloc[:,-1].values
|
91 |
+
|
92 |
+
#------------------------LogisticRegression-----------------------
# Hold out 25% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test= train_test_split(features,label, test_size= 0.25, random_state=102)

classimodel= LogisticRegression()
classimodel.fit(X_train, y_train)
trainscore = classimodel.score(X_train,y_train)
testscore = classimodel.score(X_test,y_test)
print("Logistic Regression-----------------------------------------------------\n")
print("test score: {} train score: {}".format(testscore,trainscore),'\n')

y_pred = classimodel.predict(X_test)

# The matrix used to be computed and thrown away; print it like the KNN and
# Naive Bayes sections do.
print(confusion_matrix(y_test, y_pred))

print(' f1 score: ',f1_score(y_test, y_pred),'\n')
print(' precision score: ',precision_score(y_test, y_pred),'\n')
print(' recall score: ',recall_score(y_test, y_pred),'\n')
print(classification_report(y_test, y_pred))
|
111 |
+
|
112 |
+
#--------------------------------------K-Nearest Neighbor(KNN)-----------------
# Fresh 75/25 split with its own seed, then fit a default KNN classifier.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.25, random_state=193
)

classifier = KNeighborsClassifier()
knnmodel = classifier.fit(X_train, y_train)

# Accuracy on the training and the held-out rows.
trainscore = knnmodel.score(X_train, y_train)
testscore = knnmodel.score(X_test, y_test)
print("KNN-----------------------------------------------------\n")
print(f"test score: {testscore} train score: {trainscore}", '\n')

y_predknn = knnmodel.predict(X_test)

print(confusion_matrix(y_test, y_predknn))

# Same three summary metrics, printed in the original order and format.
for metric_name, metric_fn in (
    ("f1_score", f1_score),
    ("precision_score", precision_score),
    ("recall_score", recall_score),
):
    print(f"{metric_name}: ", metric_fn(y_test, y_predknn), '\n')
print(classification_report(y_test, y_predknn))
|
132 |
+
|
133 |
+
#------------------------------naive bayes---------------------------
# Fresh 75/25 split with its own seed, then fit a Gaussian Naive Bayes model.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.25, random_state=34
)

NBmodel = GaussianNB()
NBmodel.fit(X_train, y_train)

# Accuracy on the training and the held-out rows.
trainscore = NBmodel.score(X_train, y_train)
testscore = NBmodel.score(X_test, y_test)
print("Naive Bayes-----------------------------------------------------\n")
print(f"test score: {testscore} train score: {trainscore}", '\n')
y_predNB = NBmodel.predict(X_test)
print(confusion_matrix(y_test, y_predNB))

# Same three summary metrics, printed in the original order and format.
for metric_name, metric_fn in (
    ("f1_score", f1_score),
    ("precision_score", precision_score),
    ("recall_score", recall_score),
):
    print(f"{metric_name}: ", metric_fn(y_test, y_predNB), '\n')
print(classification_report(y_test, y_predNB))
|
150 |
+
|
151 |
+
|
152 |
+
#-------------------------------- XGBoost -------------------------------------
|
153 |
+
import xgboost as xgb
|
154 |
+
from sklearn.metrics import mean_squared_error
|
155 |
+
import pandas as pd
|
156 |
+
import numpy as np
|
157 |
+
|
158 |
+
# Hold out 25% of the rows for testing (same seed as the logistic-regression split).
X_train, X_test, y_train, y_test= train_test_split(features,label, test_size= 0.25, random_state=102)

XGmodel= xgb.XGBRFClassifier()
XGmodel.fit(X_train, y_train)
trainscore = XGmodel.score(X_train,y_train)
testscore = XGmodel.score(X_test,y_test)
print("XGBoost-----------------------------------------------------\n")
print("test score: {} train score: {}".format(testscore,trainscore),'\n')

y_predXG = XGmodel.predict(X_test)

# Use this model's own predictions: the matrix was previously built from
# `y_pred` (the logistic-regression predictions) and then discarded.
print(confusion_matrix(y_test, y_predXG))

print("f1_score: ",f1_score(y_test, y_predXG),'\n')
print("precision_score: ",precision_score(y_test, y_predXG),'\n')
print("recall_score: ",recall_score(y_test, y_predXG),'\n')
print(classification_report(y_test, y_predXG),'\n')
|
175 |
+
print("AREA UNDER CURVES-----------------------------------------------------\n")
# AUC and ROC curve for each fitted model, in the order: logistic regression,
# KNN, naive Bayes, XGBoost. All curves are drawn onto the current figure.
# NOTE(review): scores are computed on the full `features`/`label` arrays,
# which include the rows each model was trained on -- confirm this is intended.
for fitted_model in (classimodel, knnmodel, NBmodel, XGmodel):
    # Probability of the positive class for every row.
    probabilityValues = fitted_model.predict_proba(features)[:,1]
    #Calculate AUC
    auc = roc_auc_score(label,probabilityValues)
    print(auc)
    #Calculate roc_curve
    fpr,tpr, threshold = roc_curve(label,probabilityValues)
    # Dashed diagonal = chance-level reference line.
    plt.plot([0,1],[0,1], linestyle = '--')
    plt.plot(fpr,tpr)
|
217 |
+
'''
|
218 |
+
#--------------------------------------INTERACE TIME LETS GO BOYS-----------------------
|
219 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
220 |
+
import joblib
|
221 |
+
import matplotlib
|
222 |
+
matplotlib.use("agg")
|
223 |
+
model_file_name = 'XG_best_model.joblib'
|
224 |
+
model_folder = 'C:\\Users\\Ben Z\\Downloads\\Models\\'
|
225 |
+
joblib.dump(XGmodel, model_folder+''+model_file_name)
|
226 |
+
|
227 |
+
#Loading da model
|
228 |
+
loaded_XG_model = joblib.load(open(model_folder+''+model_file_name, 'rb'))
|
229 |
+
print (loaded_XG_model)
|
230 |
+
def make_prediction(value1, checkbox1, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5):
|
231 |
+
input_array = np.array([value1*365.25, checkbox1, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5]).reshape(1, -1)
|
232 |
+
prediction = loaded_XG_model.predict(input_array)
|
233 |
+
info = ''
|
234 |
+
if prediction[0] == 0:
|
235 |
+
info = "You are not currently at risk of a cardiovascular disease! ✅"
|
236 |
+
else:
|
237 |
+
info = "You are at risk of a cardiovascular disease. I would recommend going to the doctor however, take my advice with a grain of salt as I am an AI model capable of making mistakes. 🚨"
|
238 |
+
final_info = "The prediction is: {}".format(info)
|
239 |
+
print (prediction[0])
|
240 |
+
return final_info
|
241 |
+
|
242 |
+
input_values = [50.3572895, 1, 168, 62, 110, 80, 1, 1, 0, 0, 1]
|
243 |
+
result = make_prediction(*input_values)
|
244 |
+
print(result)
|
245 |
+
|
246 |
+
#------------------------------------------------GRADIO Time lmfao
|
247 |
+
import gradio as gr
|
248 |
+
|
249 |
+
|
250 |
+
headline = "Cardiovascular Disease Risk Prediction Application"
|
251 |
+
iface = gr.Interface(fn=make_prediction, inputs=
|
252 |
+
[gr.inputs.Number(label="Age (Years)"),
|
253 |
+
gr.inputs.Checkbox(label="I am a male"),
|
254 |
+
gr.inputs.Number(label="Height (cm)"),
|
255 |
+
gr.inputs.Number(label="Weight (kg)"),
|
256 |
+
gr.inputs.Number(label="Systolic Blood Pressure (mmHg)"),
|
257 |
+
gr.inputs.Number(label="Diastolic Blood Pressure (mmHg)"),
|
258 |
+
gr.inputs.Number(label="Cholesterol (per 20mg/dL)"),
|
259 |
+
gr.inputs.Number(label="Glucose (per 1 mmol/L)"),
|
260 |
+
gr.inputs.Checkbox(label="I have smoked."),
|
261 |
+
gr.inputs.Checkbox(label="I drink more alcohol than I should (>2 cups for men and >1 cup for women)."),
|
262 |
+
gr.inputs.Checkbox(label="I am physically active.")
|
263 |
+
], outputs=gr.outputs.Textbox(label="Prediction Result"), title=headline, theme='soft')
|
264 |
+
|
265 |
+
if __name__ == "__main__":
|
266 |
+
iface.launch(share=True)
|
267 |
+
'''
|
268 |
+
#--------------------------------------INTERFACE TIME LETS GO BOYS-----------------------
|
269 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
270 |
+
import joblib
|
271 |
+
import matplotlib
|
272 |
+
matplotlib.use("agg")
|
273 |
+
import os

model_file_name = 'XG_best_model.joblib'
# Store the model in a 'Models' directory next to this script instead of a
# hard-coded Windows user folder, which does not exist on the deployment host.
# The trailing '' makes os.path.join end the path with a separator, so
# `model_folder + model_file_name` remains a valid path.
model_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Models', '')
os.makedirs(model_folder, exist_ok=True)
model_path = model_folder + model_file_name
joblib.dump(XGmodel, model_path)

#Loading the model
# joblib.load takes the path directly; passing open(...) left a file handle
# that was never closed.
loaded_XG_model = joblib.load(model_path)
print (loaded_XG_model)
|
280 |
+
def make_prediction(value1, checkbox1, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5):
    """Classify one person with the loaded XGBoost model and return a message.

    Arguments arrive in the order of the Gradio inputs defined below:

    Args:
        value1: age in years; multiplied by 365.25 before prediction
            (presumably the training data stores age in days -- confirm).
        checkbox1: gender selection; encoded as 1 when it contains "Male",
            otherwise 0. ``None`` (nothing selected) is treated as 0.
            NOTE(review): confirm the dataset encodes male as 1.
        value2, value3: height (cm) and weight (kg).
        value4, value5: systolic and diastolic blood pressure (mmHg).
        value6, value7: cholesterol and glucose values.
        checkbox3, checkbox4, checkbox5: smoker, alcohol and physically-active
            flags.

    Returns:
        A sentence starting with "The prediction is: " describing the result.
    """
    # `or ()` keeps the membership test valid when the widget yields None.
    is_male = 1 if "Male" in (checkbox1 or ()) else 0
    # One row, eleven columns, in the order the model was trained on.
    input_array = np.array(
        [value1*365.25, is_male, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5],
        dtype=float,
    ).reshape(1, -1)
    prediction = loaded_XG_model.predict(input_array)
    if prediction[0] == 0:
        info = "You are not currently at risk of a cardiovascular disease! ✅"
    else:
        info = "You are at risk of a cardiovascular disease. I would recommend going to the doctor however, take my advice with a grain of salt as I am an AI model capable of making mistakes. 🚨"
    return f"The prediction is: {info}"
|
291 |
+
|
292 |
+
#input_values = [50.3572895, 1, 168, 62, 110, 80, 1, 1, 0, 0, 1]
|
293 |
+
#result = make_prediction(*input_values)
|
294 |
+
#print(result)
|
295 |
+
|
296 |
+
#------------------------------------------------GRADIO Time lmfao
|
297 |
+
import gradio as gr
|
298 |
+
|
299 |
+
|
300 |
+
headline = "Cardiovascular Disease Risk Prediction Application"
# Components come from the top-level gradio namespace: the legacy
# `gr.inputs.*` / `gr.outputs.*` modules were deprecated in Gradio 3 and
# removed in Gradio 4, and requirements.txt does not pin a version.
# The input order must match the parameter order of make_prediction.
# NOTE(review): a CheckboxGroup lets the user tick both genders or neither;
# a single-choice widget may be more appropriate.
iface = gr.Interface(
    fn=make_prediction,
    inputs=[
        gr.Number(label="Age (Years)"),
        gr.CheckboxGroup(
            label="Gender",
            choices=["Male", "Female"],
        ),
        gr.Number(label="Height (cm)"),
        gr.Number(label="Weight (kg)"),
        gr.Number(label="Systolic Blood Pressure (mmHg)"),
        gr.Number(label="Diastolic Blood Pressure (mmHg)"),
        gr.Number(label="Cholesterol (per 20mg/dL)"),
        gr.Number(label="Glucose (per 1 mmol/L)"),
        gr.Checkbox(label="I have smoked."),
        gr.Checkbox(label="I drink more alcohol than I should (>2 cups for men and >1 cup for women)."),
        gr.Checkbox(label="I am physically active."),
    ],
    outputs=gr.Textbox(label="Prediction Result"),
    title=headline,
    theme='soft',
)

if __name__ == "__main__":
    # Host and port come from GRADIO_SERVER_NAME / GRADIO_SERVER_PORT (see setup.sh).
    iface.launch(share=False)
|
datasetforriskofcardiodisease (1).csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy
|
2 |
+
pandas
|
3 |
+
seaborn
|
4 |
+
matplotlib
|
5 |
+
scikit-learn
|
6 |
+
xgboost
|
7 |
+
gradio
|
8 |
+
joblib
|
setup.sh
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
export GRADIO_SERVER_NAME=0.0.0.0
|
2 |
+
export GRADIO_SERVER_PORT="$PORT"
|