File size: 4,121 Bytes
712671d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# -*- coding: utf-8 -*-
"""diarc.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Jyccp5Aeml-7oZABbACY2VTE9iQJg9Pe
# Bismillahir Rahmaanir Raheem
# Almadadh Ya Gause RadiAllahu Ta'alah Anh - Ameen
# <font color=grey>DIabetes-related Amputation Risk Calculator (DIARC)</font>
<b>_by Zakia Salod_</b>
"""
!pip install pycaret
from pycaret.utils import version
version()
from pycaret.utils import enable_colab
enable_colab()
import numpy as np # Linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # For graphical representations of the data
import seaborn as sns
# Seed NumPy's global RNG; intended to make the results reproducible.
np.random.seed(1234)


def _report_class_balance(frame):
    """Print the AMPUTATION value counts of *frame* and draw them as a count plot.

    Returns whatever ``sns.countplot`` returns so the caller can keep a handle
    on the plot.
    """
    print(frame['AMPUTATION'].value_counts())
    return sns.countplot(x="AMPUTATION", data=frame)


# Load the raw data and look at the class distribution before any cleaning.
dataset = pd.read_excel('amputation_dataset.xlsx')
ax = _report_class_balance(dataset)
# Count the rows that repeat an earlier row (notebook-style display expression).
dataset.duplicated(keep='first').sum()
# Drop the repeated rows, keeping the first occurrence of each.
dataset = dataset.drop_duplicates(keep='first')
# Class distribution again, now on the de-duplicated data.
ax = _report_class_balance(dataset)
dataset.head()
# Handle the class imbalance by under-sampling the majority (non-amputation) class.
# Work on a shuffled copy of the data (fixed seed).
shuffled_dataset = dataset.sample(frac=1, random_state=4)
amputation_labels = shuffled_dataset['AMPUTATION']
# Every row labelled AMPUTATION == 1 is kept.
amputation_dataset = shuffled_dataset[amputation_labels == 1]
# From the rows labelled AMPUTATION == 0, draw a fixed-seed random sample of 105.
# NOTE(review): 105 is presumably the number of amputation rows -- confirm
# against the data before reusing this with a different data set.
non_amputation_pool = shuffled_dataset[amputation_labels == 0]
non_amputation_dataset = non_amputation_pool.sample(n=105, random_state=42)
# Stack both subsets back into one frame (amputation rows first).
dataset = pd.concat([amputation_dataset, non_amputation_dataset])
# Show the resulting class distribution, as counts and as a plot.
class_counts = dataset['AMPUTATION'].value_counts()
print(class_counts)
ax = sns.countplot(x="AMPUTATION", data=dataset)
# Write the de-duplicated, balanced data to disk.
dataset.to_excel('amputation_removed_duplicates_and_balanced.xlsx')
from pycaret.classification import *

# Initialise the PyCaret classification experiment: AMPUTATION is the target
# and the session id is fixed at 42.
clf = setup(
    data=dataset,
    target='AMPUTATION',
    session_id=42,
)

# Display the training features (X_train).
get_config('X_train')
# Author's notes on what the displayed X_train looks like:
#   * age is converted from numeric to float
#   * gender and diabetes_class (the two binary categorical variables) are
#     label encoded:
#       - gender_f: 1 means FEMALE is TRUE, 0 means FEMALE is FALSE (i.e. MALE)
#       - diabetes_class type 1 diabetes: 1 means type 1 diabetes, 0 means
#         type 2 diabetes
#   * race is one-hot encoded: each race gets its own column, where 1 denotes
#     TRUE for that race

# Display the training labels (y_train).
get_config('y_train')

# Compare the available models, ranked by AUC.
best_model = compare_models(sort='AUC')
# Blend models: build five base estimators, bag each one, then blend the
# bagged versions.


def _create_and_bag(model_id):
    """Create the PyCaret model *model_id* and its Bagging ensemble.

    Returns a ``(base_model, bagged_model)`` pair.
    """
    base_model = create_model(model_id)
    return base_model, ensemble_model(base_model, method='Bagging')


# Each base model is created and bagged before the next one is started.
nb, bagged_nb = _create_and_bag('nb')
lr, bagged_lr = _create_and_bag('lr')
lda, bagged_lda = _create_and_bag('lda')
rf, bagged_rf = _create_and_bag('rf')
ada, bagged_ada = _create_and_bag('ada')

bagged_estimators = [bagged_nb, bagged_lr, bagged_lda, bagged_rf, bagged_ada]
blend_specific = blend_models(estimator_list=bagged_estimators)
# Plot the blended model (plot_model's default plot).
plot_model(blend_specific)

# Tune the blended model and inspect the tuned result.
tuned_blend_specific = tune_model(blend_specific)
evaluate_model(tuned_blend_specific)

# Generate predictions with the tuned model. No data is passed, so per the
# PyCaret documentation this scores the hold-out split created by setup().
tuned_blend_specific_predictions = predict_model(tuned_blend_specific)

# Finalize the model for deployment. Per the PyCaret documentation,
# finalize_model() refits the pipeline on the complete data set, hold-out
# included.
final_tuned_blend_specific = finalize_model(tuned_blend_specific)

# Save the model (creates a .pkl file).
# The finalized pipeline is the one persisted: previously the non-finalized
# `tuned_blend_specific` was saved and the result of finalize_model() was
# never used.
save_model(final_tuned_blend_specific, "tuned_blend_specific_model_19112021", verbose=True)

# Display the hold-out features (X_test).
get_config('X_test')
# Display the hold-out labels (y_test).
get_config('y_test')
dataset2 = pd.read_excel('amputation_removed_duplicates_and_balanced.xlsx')
!pip install pandas-profiling
from pandas_profiling import ProfileReport
profile = ProfileReport(dataset2, title="Pandas Profiling Report")
profile.to_file("amputation_removed_duplicates_and_balanced_report.html") |