Toben / app.py
benjaminzuckermanbasisscottsdale's picture
Upload 5 files
ac88758
raw
history blame contribute delete
No virus
13.1 kB
#importing data analysis libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix ,classification_report,precision_score, recall_score ,f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
# Dataset source:
# https://www.kaggle.com/datasets/thedevastator/exploring-risk-factors-for-cardiovascular-diseas
data = pd.read_csv('datasetforriskofcardiodisease.csv')

# Two column subsets used by the exploratory plots: numeric measurements and
# categorical/flag columns.
data_num = data.loc[:, ['age', 'height', 'weight', 'ap_hi', 'ap_lo']]
data_cat = data.loc[:, ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']]

# Display labels, listed in the same order as the columns of data_num.
xaxis = ['Age', 'Height', 'Weight', 'Systolic Blood Pressure', 'Diastolic Blood Pressure']

# Draw a histogram for every numeric column, labelled with its display name.
# No figure is created or shown between iterations, so the histograms are
# drawn onto the current matplotlib axes one after another.
for col, axis_label in zip(data_num.columns, xaxis):
    plt.hist(data_num[col])
    plt.title(f'Frequency vs. {axis_label}')
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
#plt.show()
# Per-`cardio`-group aggregate (pandas' default aggregation, the mean) of each
# numeric column. The table is printed like every other pivot in this script;
# previously the result was computed and silently discarded.
print(pd.pivot_table(data, index='cardio', values=['age', 'height', 'weight', 'ap_hi', 'ap_lo']))

# Bar chart of how often each value occurs in every categorical column.
for i in data_cat.columns:
    # Count once and reuse for both axes instead of calling value_counts() twice.
    counts = data_cat[i].value_counts()
    sns.barplot(x=counts.index, y=counts).set_title(i)
#plt.show()
# For age, ap_hi (systolic blood pressure) and ap_lo (diastolic blood
# pressure): print one pivot table per categorical factor, with rows keyed by
# `cardio` and columns keyed by the factor's values.
separator = "=" * 100
for measurement in ('age', 'ap_hi', 'ap_lo'):
    for position, factor in enumerate(('cholesterol', 'gluc', 'smoke', 'alco', 'active')):
        # The separator goes *between* tables of the same measurement only;
        # nothing is printed between one measurement's last table and the
        # next measurement's first table.
        if position:
            print(separator)
        print(pd.pivot_table(data, index='cardio', columns=factor, values=measurement))
# Box plot of every numeric column, titled with the column name. As with the
# other plots, nothing is shown or cleared between iterations.
for column_name, column_values in data_num.items():
    sns.boxplot(column_values)
    plt.title(column_name)
#plt.show()
#Getting interquartile range
def outlinefree(dataCol):
    """Return the IQR-based outlier fences for a sequence of numbers.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        dataCol: Array-like of numeric values (list, NumPy array, pandas
            Series, ...). It is not modified.

    Returns:
        A ``(LowerRange, UpperRange)`` tuple of floats.
    """
    # np.percentile does not require sorted input, so the former
    # `sorted(dataCol)` call (whose result was thrown away) has been dropped.
    Q1, Q3 = np.percentile(dataCol, [25, 75])
    IQR = Q3 - Q1
    LowerRange = Q1 - (1.5 * IQR)
    UpperRange = Q3 + (1.5 * IQR)
    return LowerRange, UpperRange
#Removing outliers
# IQR fences for the two blood-pressure columns.
lwap_hi, upap_hi = outlinefree(data['ap_hi'])
lwap_lo, upap_lo = outlinefree(data['ap_lo'])

# Cap every reading above the upper fence at the fence value. The result is
# assigned back to the column: the previous chained
# `data[col].replace(..., inplace=True)` acted on an intermediate Series,
# which pandas deprecates and which leaves `data` unchanged under
# Copy-on-Write. `clip` also avoids building a Python list of every outlier.
data['ap_hi'] = data['ap_hi'].clip(upper=upap_hi)
data['ap_lo'] = data['ap_lo'].clip(upper=upap_lo)
# NOTE(review): the lower fences (lwap_hi / lwap_lo) are computed but never
# applied, so unusually low readings are kept as-is. Confirm this is intended.

# Every column except the last is a model input; the last column is the target.
# NOTE(review): this relies on the CSV's column order and will include any
# index/id columns present in the file - verify against the dataset.
features = data.iloc[:, :-1].values
label = data.iloc[:, -1].values
#------------------------LogisticRegression-----------------------
# Hold out 25% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=102)

# Fit with scikit-learn's default hyper-parameters.
# NOTE(review): warnings are silenced globally at the top of the file, so a
# solver convergence warning on the unscaled features would not be visible.
classimodel = LogisticRegression()
classimodel.fit(X_train, y_train)

# Mean accuracy on the training and held-out rows.
trainscore = classimodel.score(X_train, y_train)
testscore = classimodel.score(X_test, y_test)
print("Logistic Regression-----------------------------------------------------\n")
print("test score: {} train score: {}".format(testscore,trainscore),'\n')

y_pred = classimodel.predict(X_test)
# Print the confusion matrix like the KNN and Naive Bayes sections do; it was
# previously computed here and discarded.
print(confusion_matrix(y_test, y_pred))
print(' f1 score: ',f1_score(y_test, y_pred),'\n')
print(' precision score: ',precision_score(y_test, y_pred),'\n')
print(' recall score: ',recall_score(y_test, y_pred),'\n')
print(classification_report(y_test, y_pred))
#--------------------------------------K-Nearest Neighbor(KNN)-----------------
# This section draws its own 75/25 split (seed 193), so its test rows differ
# from those used by the other models.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.25,
    random_state=193,
)

# `fit` returns the estimator itself, so `knnmodel` and `classifier` refer to
# the same fitted object.
classifier = KNeighborsClassifier()
knnmodel = classifier.fit(X_train, y_train)

# Accuracy on the training rows and on the held-out rows.
trainscore = knnmodel.score(X_train, y_train)
testscore = knnmodel.score(X_test, y_test)
print("KNN-----------------------------------------------------\n")
print("test score: {} train score: {}".format(testscore, trainscore), '\n')

# Held-out predictions followed by the standard classification metrics.
y_predknn = knnmodel.predict(X_test)
print(confusion_matrix(y_test, y_predknn))
for metric_label, metric_fn in (
    ("f1_score: ", f1_score),
    ("precision_score: ", precision_score),
    ("recall_score: ", recall_score),
):
    print(metric_label, metric_fn(y_test, y_predknn), '\n')
print(classification_report(y_test, y_predknn))
#------------------------------naive bayes---------------------------
# Separate 75/25 split for this model (seed 34).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.25,
    random_state=34,
)

# Gaussian Naive Bayes with default settings.
NBmodel = GaussianNB()
NBmodel.fit(X_train, y_train)

# Accuracy on the training rows and on the held-out rows.
trainscore = NBmodel.score(X_train, y_train)
testscore = NBmodel.score(X_test, y_test)
print("Naive Bayes-----------------------------------------------------\n")
print("test score: {} train score: {}".format(testscore, trainscore), '\n')

# Held-out predictions followed by the standard classification metrics.
y_predNB = NBmodel.predict(X_test)
print(confusion_matrix(y_test, y_predNB))
for metric_label, metric_fn in (
    ("f1_score: ", f1_score),
    ("precision_score: ", precision_score),
    ("recall_score: ", recall_score),
):
    print(metric_label, metric_fn(y_test, y_predNB), '\n')
print(classification_report(y_test, y_predNB))
#-------------------------------- XGBoost -------------------------------------
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Same seed (102) as the logistic-regression section, hence the same split.
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=102)

# NOTE(review): XGBRFClassifier is xgboost's random-forest-style classifier,
# not the gradient-boosted XGBClassifier - confirm this is the intended model.
XGmodel = xgb.XGBRFClassifier()
XGmodel.fit(X_train, y_train)

# Accuracy on the training rows and on the held-out rows.
trainscore = XGmodel.score(X_train, y_train)
testscore = XGmodel.score(X_test, y_test)
print("XGBoost-----------------------------------------------------\n")
print("test score: {} train score: {}".format(testscore,trainscore),'\n')

y_predXG = XGmodel.predict(X_test)
# Use this model's own predictions and print the matrix. The previous code
# passed `y_pred` (the logistic-regression predictions) and discarded the result.
print(confusion_matrix(y_test, y_predXG))
print("f1_score: ",f1_score(y_test, y_predXG),'\n')
print("precision_score: ",precision_score(y_test, y_predXG),'\n')
print("recall_score: ",recall_score(y_test, y_predXG),'\n')
print(classification_report(y_test, y_predXG),'\n')
print("AREA UNDER CURVES-----------------------------------------------------\n")
# For each fitted model, in the order LogisticRegression, KNeighborsClassifier,
# naive bayes, XGBoost: print the ROC AUC and draw the ROC curve together with
# the dashed chance diagonal.
# NOTE(review): the scores are computed over the full `features`/`label`
# arrays, which include the rows each model was trained on, so these AUC
# values are not held-out estimates.
for fitted_model in (classimodel, knnmodel, NBmodel, XGmodel):
    # Probability assigned to the positive class (second column).
    probabilityValues = fitted_model.predict_proba(features)[:, 1]

    #Calculate AUC
    auc = roc_auc_score(label, probabilityValues)
    print(auc)

    #Calculate roc_curve
    fpr, tpr, threshold = roc_curve(label, probabilityValues)
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot(fpr, tpr)
# NOTE: an earlier draft of the prediction interface (a single "I am a male"
# checkbox, launched with share=True) used to be kept here inside a
# module-level string literal. It was never executed and duplicated the active
# implementation that follows, so it has been removed; recover it from version
# control if it is ever needed.
#-------------------------------------- Prediction interface: model persistence -----------------------
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import matplotlib
# Switch matplotlib to the non-interactive Agg backend.
matplotlib.use("agg")

# Location of the serialized model.
# NOTE(review): this is a hard-coded, machine-specific Windows path; confirm it
# is valid wherever the app is deployed.
model_file_name = 'XG_best_model.joblib'
model_folder = 'C:\\Users\\Ben Z\\Downloads\\Models\\'
model_path = model_folder + model_file_name

# Persist the freshly trained model, then read it back.
joblib.dump(XGmodel, model_path)
# Pass the path straight to joblib so it opens *and closes* the file itself;
# the previous `joblib.load(open(...))` never closed the handle.
loaded_XG_model = joblib.load(model_path)
print (loaded_XG_model)
def make_prediction(value1, checkbox1, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5):
    """Run the loaded model on one set of form inputs and return a message.

    The positional order matches the Gradio inputs declared for the interface.

    Args:
        value1: Age in years. It is multiplied by 365.25 before prediction
            (presumably because the training data stores age in days - verify
            against the dataset).
        checkbox1: Gender selection. May be the list of selected choices, a
            single string, a boolean/number (truthy means male), or ``None``.
        value2: Height (cm).
        value3: Weight (kg).
        value4: Systolic blood pressure (mmHg).
        value5: Diastolic blood pressure (mmHg).
        value6: Cholesterol value.
        value7: Glucose value.
        checkbox3: Whether the person has smoked.
        checkbox4: Whether the person drinks more alcohol than recommended.
        checkbox5: Whether the person is physically active.

    Returns:
        A human-readable string describing the prediction, or a prompt to
        complete the form when a numeric field is empty.
    """
    # An empty numeric field arrives as None; report it instead of failing
    # with a TypeError on the arithmetic below.
    if any(value is None for value in (value1, value2, value3, value4, value5, value6, value7)):
        return "Please fill in every numeric field before requesting a prediction."

    # Normalise the gender input to a 1/0 flag. Containers and strings keep the
    # original `"Male" in ...` test; None and plain booleans/numbers (which
    # previously raised TypeError) fall back to truthiness.
    # NOTE(review): confirm 1 = male / 0 = female matches how the `gender`
    # column is encoded in the training data.
    if isinstance(checkbox1, (str, list, tuple, set, frozenset)):
        checkbox1 = 1 if "Male" in checkbox1 else 0
    else:
        checkbox1 = 1 if checkbox1 else 0

    # Single-row feature matrix in the order used by the original code.
    input_array = np.array([value1*365.25, checkbox1, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5]).reshape(1, -1)
    prediction = loaded_XG_model.predict(input_array)

    if prediction[0] == 0:
        info = "You are not currently at risk of a cardiovascular disease! ✅"
    else:
        info = "You are at risk of a cardiovascular disease. I would recommend going to the doctor however, take my advice with a grain of salt as I am an AI model capable of making mistakes. 🚨"
    final_info = "The prediction is: {}".format(info)
    return final_info
#input_values = [50.3572895, 1, 168, 62, 110, 80, 1, 1, 0, 0, 1]
#result = make_prediction(*input_values)
#print(result)
#------------------------------------------------ Gradio user interface
import gradio as gr

headline = "Cardiovascular Disease Risk Prediction Application"

# Components are created from the top-level `gr` namespace: the
# `gr.inputs.*` / `gr.outputs.*` namespaces used previously are deprecated in
# Gradio 3 and removed in Gradio 4. The input order must match the positional
# parameters of make_prediction.
# NOTE(review): a checkbox group lets the user tick both genders or none;
# a single-choice control may be a better fit.
iface = gr.Interface(
    fn=make_prediction,
    inputs=[
        gr.Number(label="Age (Years)"),
        gr.CheckboxGroup(
            label="Gender",
            choices=["Male", "Female"],
        ),
        gr.Number(label="Height (cm)"),
        gr.Number(label="Weight (kg)"),
        gr.Number(label="Systolic Blood Pressure (mmHg)"),
        gr.Number(label="Diastolic Blood Pressure (mmHg)"),
        gr.Number(label="Cholesterol (per 20mg/dL)"),
        gr.Number(label="Glucose (per 1 mmol/L)"),
        gr.Checkbox(label="I have smoked."),
        gr.Checkbox(label="I drink more alcohol than I should (>2 cups for men and >1 cup for women)."),
        gr.Checkbox(label="I am physically active."),
    ],
    outputs=gr.Textbox(label="Prediction Result"),
    title=headline,
    theme='soft',
)

if __name__ == "__main__":
    # Serve without creating a public share link.
    iface.launch(share=False)