import pickle import pandas as pd import sklearn import numpy as np import gradio as gr import imblearn import pandas as pd import seaborn as sns import sklearn from sklearn.ensemble import RandomForestClassifier import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix,plot_confusion_matrix,classification_report from imblearn.over_sampling import RandomOverSampler import numpy as np from sklearn.preprocessing import StandardScaler,OneHotEncoder from sklearn.compose import make_column_transformer import warnings warnings.filterwarnings("ignore") import tensorflow as tf from tensorflow.keras.applications import imagenet_utils from tensorflow.keras.utils import img_to_array from tensorflow.keras.models import load_model import cv2 import pdfplumber import re from collections import namedtuple # Load Data def loading_data(): df = pd.read_csv("diabetes_012__health_indicators_BRFSS2015.csv") #convert all columns to integer for col in df.columns: df[col] = df[col].astype("int") # Drop duplicated rows df_ = df.drop_duplicates() df=df_ X,y = df.drop(['Diabetes_012'],axis=1),df['Diabetes_012'].values classes = np.unique(y) #Oversampling data randomSampler = RandomOverSampler(sampling_strategy='all',random_state=24) X_new,y_new = randomSampler.fit_resample(X,y) new_df = X_new.copy() new_df["Diabetes_012"] = y_new Xtrain,Xtest,Ytrain,Ytest = train_test_split(X_new,y_new,test_size=0.2,random_state=24,stratify=y_new) # Some feature engineering Xtrain_transf = Xtrain.copy() Xtrain_transf["Age2"] = Xtrain_transf["Age"]**2 ## Numerical column transformation col_num=["BMI","MentHlth","PhysHlth","Age","Age2"] num_col_trans = make_column_transformer((StandardScaler(),col_num),remainder="passthrough") Xtrain_transf_std = num_col_trans.fit_transform(Xtrain_transf,Ytrain) Xtrain_transf_std = pd.DataFrame(Xtrain_transf_std ,columns=list(Xtrain_transf.columns) ) # Données tests: Xtest_transf = Xtest.copy() Xtest_transf["Age2"] = Xtest_transf["Age"]**2 Xtest_transf_std = num_col_trans.transform(Xtest_transf) Xtest_transf_std = pd.DataFrame(Xtest_transf_std ,columns=list(Xtest_transf.columns) ) ## Categorical columns transformation col_cat = ["GenHlth","Education","Income"] cat_col_trans = make_column_transformer((OneHotEncoder(handle_unknown = 'ignore'),col_cat),remainder="passthrough") Xtrain_transf_std_encoded = cat_col_trans.fit_transform(Xtrain_transf_std,Ytrain) # Sur les données tests : Xtest_encoded = cat_col_trans.transform(Xtest_transf_std) results = {"Xtrain": Xtrain, "Ytrain": Ytrain, "Xtest_encoded": Xtest_encoded, "Ytest": Ytest, "num_col_trans": num_col_trans, "cat_col_trans": cat_col_trans} return results # Get PDF File for user def get_data_inputPDF(PDF_file): path_file = PDF_file.name with pdfplumber.open(path_file) as pdf: num_page = 0 page = pdf.pages[num_page] text = page.extract_text(x_tolerance=2, y_tolerance=0) AnyHealthcare_pattern = re.compile('^(AnyHealthcare)\s+(\w{3})') HighBP_pattern = re.compile('^(HighBP)\s+(\w{3})') NoDocbcCost_pattern = re.compile('^(NoDocbcCost)\s+(\w{3})') HighChol_pattern = re.compile('^(HighChol)\s+(\w{3})') GenHlth_pattern = re.compile('^(GenHlth)\s+(\d+)') CholCheck_pattern = re.compile('^(CholCheck)\s+(\w{3})') MentHlth_pattern = re.compile('^(MentHlth)\s+(\d+)') BMI_pattern = re.compile('^(BMI)\s+(\d+)') PhysHlth_pattern = re.compile('^(PhysHlth)\s+(\d+)') Smoker_pattern = re.compile('^(Smoker)\s+(\w{3})') DiffWalk_pattern = re.compile('^(DiffWalk)\s+(\w{3})') Stroke_pattern = re.compile('^(Stroke)\s+(\w{3})') Sex_pattern = re.compile('^(Sex)\s+(\w+)') HeartDiseaseorAttack_pattern = re.compile('^(HeartDiseaseorAttack)\s+(\w{3})') Age_pattern = re.compile('^(Age)\s+(\d+)') PhysActivity_pattern = re.compile('^(PhysActivity)\s+(\w{3})') Education_pattern = re.compile('^(Education)\s+(\d+)') Fruits_pattern = re.compile('^(Fruits)\s+(\w{3})') Income_pattern = re.compile('^(Income)\s+(\d+)') Veggies_pattern = re.compile('^(Veggies)\s+(\w{3})') HvyAlcoholConsump_pattern = re.compile('^(HvyAlcoholConsump)\s+(\w{3})') for line in text.split("\n"): AnyHealthcare_ligne = AnyHealthcare_pattern.search(line) HighBP_ligne = HighBP_pattern.search(line) NoDocbcCost_ligne = NoDocbcCost_pattern.search(line) HighChol_ligne = HighChol_pattern.search(line) GenHlth_ligne = GenHlth_pattern.search(line) CholCheck_ligne = CholCheck_pattern.search(line) MentHlth_ligne = MentHlth_pattern.search(line) BMI_ligne = BMI_pattern.search(line) PhysHlth_ligne = PhysHlth_pattern.search(line) Smoker_ligne = Smoker_pattern.search(line) DiffWalk_ligne = DiffWalk_pattern.search(line) Stroke_ligne = Stroke_pattern.search(line) Sex_ligne = Sex_pattern.search(line) HeartDiseaseorAttack_ligne = HeartDiseaseorAttack_pattern.search(line) Age_ligne = Age_pattern.search(line) PhysActivity_ligne = PhysActivity_pattern.search(line) Education_ligne = Education_pattern.search(line) Fruits_ligne = Fruits_pattern.search(line) Income_ligne = Income_pattern.search(line) Veggies_ligne = Veggies_pattern.search(line) HvyAlcoholConsump_ligne = HvyAlcoholConsump_pattern.search(line) if AnyHealthcare_ligne: AnyHealthcare = AnyHealthcare_ligne.group(2) if HvyAlcoholConsump_ligne: HvyAlcoholConsump = HvyAlcoholConsump_ligne.group(2) if HighBP_ligne: HighBP = HighBP_ligne.group(2) if NoDocbcCost_ligne: NoDocbcCost = NoDocbcCost_ligne.group(2) if HighChol_ligne: HighChol= HighChol_ligne.group(2) if GenHlth_ligne: GenHlth = GenHlth_ligne.group(2) if CholCheck_ligne: CholCheck = CholCheck_ligne.group(2) if MentHlth_ligne: MentHlth = MentHlth_ligne.group(2) if BMI_ligne: BMI = BMI_ligne.group(2) if PhysHlth_ligne: PhysHlth = PhysHlth_ligne.group(2) if Smoker_ligne: Smoker = Smoker_ligne.group(2) if DiffWalk_ligne: DiffWalk = DiffWalk_ligne.group(2) if Stroke_ligne: Stroke = Stroke_ligne.group(2) if Sex_ligne: Sex = Sex_ligne.group(2) if HeartDiseaseorAttack_ligne: HeartDiseaseorAttack = HeartDiseaseorAttack_ligne.group(2) if Age_ligne: Age = Age_ligne.group(2) if PhysActivity_ligne: PhysActivity = PhysActivity_ligne.group(2) if Education_ligne: Education = Education_ligne.group(2) if Fruits_ligne: Fruits = Fruits_ligne.group(2) if Income_ligne: Income = Income_ligne.group(2) if Veggies_ligne: Veggies = Veggies_ligne.group(2) tab = namedtuple('table','HighBP HighChol CholCheck BMI Smoker Stroke HeartDiseaseorAttack \ PhysActivity Fruits Veggies HvyAlcoholConsump AnyHealthcare NoDocbcCost GenHlth \ MentHlth PhysHlth DiffWalk \ Sex Age Education Income') data_unpreprared = pd.DataFrame([tab(HighBP,HighChol, CholCheck, BMI, Smoker,Stroke,HeartDiseaseorAttack, PhysActivity, Fruits, Veggies,HvyAlcoholConsump,AnyHealthcare, NoDocbcCost, GenHlth, MentHlth, PhysHlth, DiffWalk, Sex, Age, Education,Income)]) # Preprare data type AnyHealthcare = 0 if AnyHealthcare=="Non" else 1 HvyAlcoholConsump = 0 if HvyAlcoholConsump=="Non" else 1 HighBP = 0 if HighBP=="Non" else 1 NoDocbcCost = 0 if NoDocbcCost=="Non" else 1 HighChol= 0 if HighChol=="Non" else 1 GenHlth = int(GenHlth) CholCheck = 0 if CholCheck=="Non" else 1 MentHlth = int(MentHlth) BMI = int(BMI) PhysHlth = int(PhysHlth) Smoker = 0 if Smoker=="Non" else 1 DiffWalk = 0 if DiffWalk=="Non" else 1 Stroke = 0 if Stroke=="Non" else 1 Sex = 0 if Sex=="Homme" else 1 HeartDiseaseorAttack = 0 if HeartDiseaseorAttack=="Non" else 1 Age = int(Age) PhysActivity = 0 if PhysActivity=="Non" else 1 Education = int(Education) Fruits = 0 if Fruits=="Non" else 1 Income = int(Income) Veggies = 0 if Veggies=="Non" else 1 user_df = pd.DataFrame([tab(HighBP,HighChol, CholCheck, BMI, Smoker,Stroke,HeartDiseaseorAttack, PhysActivity, Fruits, Veggies,HvyAlcoholConsump,AnyHealthcare, NoDocbcCost, GenHlth, MentHlth, PhysHlth, DiffWalk, Sex, Age, Education,Income)]) ## Load Model from pickle file with open("Model_package.pkl","rb") as f: Model_package = pickle.load(f) model = Model_package['my_classif'] #LOADING THE DATA results = loading_data() num_col_trans = results["num_col_trans"] cat_col_trans = results["cat_col_trans"] # Data preparation before fitting to the model user_df["Age2"] = user_df["Age"]**2 user_df_std = num_col_trans.transform(user_df) user_df_std = pd.DataFrame(user_df_std ,columns=list(user_df.columns) ) user_df_encoded = cat_col_trans.transform(user_df_std) user_encoded = np.array(user_df_encoded).reshape((1,-1)) #PREDICTION target_names = ['No diabetes', 'Prediabetes', 'Diabetes'] pred_proba = model.predict_proba(user_encoded)[0] results = {classe : pred_proba[i] for i, classe in enumerate(target_names)} max_prob = 0 ma_classe = "" for k,v in results.items(): if v > max_prob: max_prob= v ma_classe= k STYLE = """ """ OUTPUT_OK = ( STYLE + f"""