"""Cardiovascular-disease risk: exploration, model comparison and a Gradio UI.

The script runs top to bottom:

1. loads the Kaggle cardiovascular-risk CSV and prints/plots summary views,
2. caps high blood-pressure outliers at the upper IQR fence,
3. trains and reports Logistic Regression, KNN, Gaussian Naive Bayes and an
   XGBoost random-forest classifier, each on its own train/test split,
4. prints the ROC AUC of every model on that model's held-out split,
5. saves the XGBoost model with joblib, reloads it, and exposes it through a
   Gradio form (launched only when the file is run as a script).
"""
# importing data analysis libraries
import warnings

import gradio as gr
import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    mean_squared_error,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

warnings.filterwarnings('ignore')

# Model inputs, in exactly the order make_prediction() assembles them for the
# saved model.  Selecting by name keeps any extra CSV columns (row ids, an
# exported index, ...) out of training, so the fitted models accept the UI's
# 11-value vector.
FEATURE_COLUMNS = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                   'cholesterol', 'gluc', 'smoke', 'alco', 'active']
TARGET_COLUMN = 'cardio'

# data is from https://www.kaggle.com/datasets/thedevastator/exploring-risk-factors-for-cardiovascular-diseas
data = pd.read_csv('datasetforriskofcardiodisease.csv')

data_num = data[['age', 'height', 'weight', 'ap_hi', 'ap_lo']]
data_cat = data[['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']]

# ------------------------ Exploratory analysis ------------------------
# Every plot gets its own figure; without that, successive plots are drawn on
# top of each other on the same axes.  Call plt.show() to display them
# interactively.
xaxis = ['Age', 'Height', 'Weight', 'Systolic Blood Pressure', 'Diastolic Blood Pressure']
for col, axis_label in zip(data_num.columns, xaxis):
    plt.figure()
    plt.hist(data_num[col])
    plt.title(f'Frequency vs. {axis_label}')
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')

# Mean of each numeric column per outcome class.
print(pd.pivot_table(data, index='cardio', values=['age', 'height', 'weight', 'ap_hi', 'ap_lo']))

for col in data_cat.columns:
    counts = data_cat[col].value_counts()
    plt.figure()
    sns.barplot(x=counts.index, y=counts).set_title(col)

# Mean age / systolic (ap_hi) / diastolic (ap_lo) pressure per outcome class,
# broken down by each categorical variable.  A separator line is printed
# between the tables of one measure, but not after the last one.
for measure in ('age', 'ap_hi', 'ap_lo'):
    for position, category in enumerate(('cholesterol', 'gluc', 'smoke', 'alco', 'active')):
        if position:
            print("=" * 100)
        print(pd.pivot_table(data, index='cardio', columns=category, values=measure))

for col in data_num.columns:
    plt.figure()
    sns.boxplot(data_num[col])
    plt.title(col)


# Getting interquartile range
def outlinefree(dataCol):
    """Return the Tukey outlier fences of a numeric column.

    Args:
        dataCol: one-dimensional numeric data (e.g. a DataFrame column).

    Returns:
        ``(LowerRange, UpperRange)`` where ``LowerRange = Q1 - 1.5 * IQR`` and
        ``UpperRange = Q3 + 1.5 * IQR``.
    """
    # np.percentile does not need pre-sorted input.
    Q1, Q3 = np.percentile(dataCol, [25, 75])
    IQR = Q3 - Q1
    LowerRange = Q1 - (1.5 * IQR)
    UpperRange = Q3 + (1.5 * IQR)
    return LowerRange, UpperRange


# Removing outliers: readings above the upper fence are capped at the fence.
# NOTE(review): the lower fences are computed but not applied, so implausibly
# low readings are left untouched - confirm that this is intended.
lwap_hi, upap_hi = outlinefree(data['ap_hi'])
lwap_lo, upap_lo = outlinefree(data['ap_lo'])
data['ap_hi'] = data['ap_hi'].clip(upper=upap_hi)
data['ap_lo'] = data['ap_lo'].clip(upper=upap_lo)

features = data[FEATURE_COLUMNS].values
label = data[TARGET_COLUMN].values

# ------------------------LogisticRegression-----------------------
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=102)
classimodel = LogisticRegression()
classimodel.fit(X_train, y_train)
trainscore = classimodel.score(X_train, y_train)
testscore = classimodel.score(X_test, y_test)
print("Logistic Regression-----------------------------------------------------\n")
print("test score: {} train score: {}".format(testscore,trainscore),'\n')
y_pred = classimodel.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(' f1 score: ',f1_score(y_test, y_pred),'\n')
print(' precision score: ',precision_score(y_test, y_pred),'\n')
print(' recall score: ',recall_score(y_test, y_pred),'\n')
print(classification_report(y_test, y_pred))
# Keep this model's held-out split for the ROC/AUC section below.
lr_holdout = (X_test, y_test)

# --------------------------------------K-Nearest Neighbor(KNN)-----------------
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=193)
classifier = KNeighborsClassifier()
knnmodel = classifier.fit(X_train, y_train)
trainscore = knnmodel.score(X_train, y_train)
testscore = knnmodel.score(X_test, y_test)
print("KNN-----------------------------------------------------\n")
print("test score: {} train score: {}".format(testscore,trainscore),'\n')
y_predknn = knnmodel.predict(X_test)
print(confusion_matrix(y_test, y_predknn))
print("f1_score: ",f1_score(y_test, y_predknn),'\n')
print("precision_score: ",precision_score(y_test, y_predknn),'\n')
print("recall_score: ",recall_score(y_test, y_predknn),'\n')
print(classification_report(y_test, y_predknn))
knn_holdout = (X_test, y_test)

# ------------------------------naive bayes---------------------------
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=34)
NBmodel = GaussianNB()
NBmodel.fit(X_train, y_train)
trainscore = NBmodel.score(X_train, y_train)
testscore = NBmodel.score(X_test, y_test)
print("Naive Bayes-----------------------------------------------------\n")
print("test score: {} train score: {}".format(testscore,trainscore),'\n')
y_predNB = NBmodel.predict(X_test)
print(confusion_matrix(y_test, y_predNB))
print("f1_score: ",f1_score(y_test, y_predNB),'\n')
print("precision_score: ",precision_score(y_test, y_predNB),'\n')
print("recall_score: ",recall_score(y_test, y_predNB),'\n')
print(classification_report(y_test, y_predNB))
nb_holdout = (X_test, y_test)

# -------------------------------- XGBoost -------------------------------------
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.25, random_state=102)
XGmodel = xgb.XGBRFClassifier()
XGmodel.fit(X_train, y_train)
trainscore = XGmodel.score(X_train, y_train)
testscore = XGmodel.score(X_test, y_test)
print("XGBoost-----------------------------------------------------\n")
print("test score: {} train score: {}".format(testscore,trainscore),'\n')
y_predXG = XGmodel.predict(X_test)
# Report this model's own predictions (not the logistic-regression ones).
print(confusion_matrix(y_test, y_predXG))
print("f1_score: ",f1_score(y_test, y_predXG),'\n')
print("precision_score: ",precision_score(y_test, y_predXG),'\n')
print("recall_score: ",recall_score(y_test, y_predXG),'\n')
print(classification_report(y_test, y_predXG),'\n')
xg_holdout = (X_test, y_test)


def _report_auc(model, X_eval, y_eval, curve_label):
    """Print a model's ROC AUC and add its ROC curve to the current axes.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        X_eval: feature rows to score.
        y_eval: true binary labels for ``X_eval``.
        curve_label: legend label for the plotted curve.

    Returns:
        The ROC AUC value that was printed.
    """
    positive_probability = model.predict_proba(X_eval)[:, 1]
    auc_value = roc_auc_score(y_eval, positive_probability)
    print(auc_value)
    false_positive_rate, true_positive_rate, _ = roc_curve(y_eval, positive_probability)
    plt.plot(false_positive_rate, true_positive_rate, label=curve_label)
    return auc_value


print("AREA UNDER CURVES-----------------------------------------------------\n")
# Each model is scored on the split it was NOT trained on; scoring on the full
# dataset would mix training rows into the evaluation and inflate the AUC.
plt.figure()
plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level reference line
_report_auc(classimodel, *lr_holdout, curve_label='Logistic Regression')
_report_auc(knnmodel, *knn_holdout, curve_label='KNN')
_report_auc(NBmodel, *nb_holdout, curve_label='Naive Bayes')
_report_auc(XGmodel, *xg_holdout, curve_label='XGBoost')
plt.legend()

# --------------------------------------INTERACE TIME LETS GO BOYS-----------------------
matplotlib.use("agg")

model_file_name = 'XG_best_model.joblib'
# NOTE(review): machine-specific output folder; it must already exist.
model_folder = 'C:\\Users\\Ben Z\\Downloads\\Models\\'
model_path = model_folder + model_file_name

joblib.dump(XGmodel, model_path)

# Loading the model back from disk.  Passing the path lets joblib open and
# close the file itself instead of leaving a handle open.
loaded_XG_model = joblib.load(model_path)
print (loaded_XG_model)


def make_prediction(value1, checkbox1, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5):
    """Classify one person with the reloaded XGBoost model and describe the result.

    Args:
        value1: age in years; multiplied by 365.25 before being passed to the
            model (presumably the dataset stores age in days - confirm).
        checkbox1: gender selection.  Either a collection/string of selected
            choices (encoded 1 when it contains ``"Male"``, else 0) or a plain
            bool/int flag (encoded by truthiness).
        value2: height.
        value3: weight.
        value4: systolic blood pressure.
        value5: diastolic blood pressure.
        value6: cholesterol level.
        value7: glucose level.
        checkbox3: smoking flag.
        checkbox4: alcohol flag.
        checkbox5: physical-activity flag.

    Returns:
        A sentence starting with ``"The prediction is: "``, or a request to
        complete the form when a numeric field is empty.
    """
    # An empty numeric form field arrives as None; answer with a message
    # instead of failing on the arithmetic below.
    if any(v is None for v in (value1, value2, value3, value4, value5, value6, value7)):
        return "Please fill in every numeric field before requesting a prediction."

    # NOTE(review): "Male" is encoded as 1 and anything else as 0 - confirm
    # this matches how the dataset's `gender` column is coded.
    if isinstance(checkbox1, (list, tuple, set, str)):
        gender_flag = 1 if "Male" in checkbox1 else 0
    else:
        gender_flag = int(bool(checkbox1))

    input_array = np.array([value1*365.25, gender_flag, value2, value3, value4, value5, value6, value7, checkbox3, checkbox4, checkbox5]).reshape(1, -1)
    prediction = loaded_XG_model.predict(input_array)

    if prediction[0] == 0:
        info = "You are not currently at risk of a cardiovascular disease! ✅"
    else:
        info = "You are at risk of a cardiovascular disease. I would recommend going to the doctor however, take my advice with a grain of salt as I am an AI model capable of making mistakes. 🚨"
    final_info = "The prediction is: {}".format(info)
    return final_info


# ------------------------------------------------GRADIO Time
headline = "Cardiovascular Disease Risk Prediction Application"
iface = gr.Interface(
    fn=make_prediction,
    inputs=[
        gr.inputs.Number(label="Age (Years)"),
        gr.inputs.CheckboxGroup(
            label="Gender",
            choices=["Male", "Female"],
        ),
        gr.inputs.Number(label="Height (cm)"),
        gr.inputs.Number(label="Weight (kg)"),
        gr.inputs.Number(label="Systolic Blood Pressure (mmHg)"),
        gr.inputs.Number(label="Diastolic Blood Pressure (mmHg)"),
        gr.inputs.Number(label="Cholesterol (per 20mg/dL)"),
        gr.inputs.Number(label="Glucose (per 1 mmol/L)"),
        gr.inputs.Checkbox(label="I have smoked."),
        gr.inputs.Checkbox(label="I drink more alcohol than I should (>2 cups for men and >1 cup for women)."),
        gr.inputs.Checkbox(label="I am physically active."),
    ],
    outputs=gr.outputs.Textbox(label="Prediction Result"),
    title=headline,
    theme='soft',
)

if __name__ == "__main__":
    iface.launch(share=False)