File size: 4,121 Bytes
712671d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# -*- coding: utf-8 -*-
"""diarc.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Jyccp5Aeml-7oZABbACY2VTE9iQJg9Pe

# Bismillahir Rahmaanir Raheem
# Almadadh Ya Gause RadiAllahu Ta'alah Anh - Ameen

# <font color=grey>DIabetes-related Amputation Risk Calculator (DIARC)</font>
<b>_by Zakia Salod_</b>
"""

# Install PyCaret (IPython/Colab shell magic — only valid inside a notebook,
# not when this file is executed as a plain Python script).
!pip install pycaret

# Print the installed PyCaret version as a sanity check.
from pycaret.utils import version
version()

# Adjust PyCaret's display settings so output renders correctly in Google Colab.
from pycaret.utils import enable_colab
enable_colab()

import numpy as np # Linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # For graphical representations of the data
import seaborn as sns # Statistical plots (count plots of the class balance)

# Just to make sure the results are reproducible
# (seeds NumPy's global RNG; pandas' .sample() calls below pass their own
# explicit random_state, so those draws are reproducible independently).
np.random.seed(1234)

# Load the raw amputation dataset from the working directory.
dataset = pd.read_excel('amputation_dataset.xlsx')

# Class balance of the target before any cleaning.
print(dataset['AMPUTATION'].value_counts())

ax = sns.countplot(x="AMPUTATION", data=dataset)

# Show the number of duplicate rows in this dataset.
# FIX: in a plain script a bare expression is evaluated and discarded, so the
# count must be printed explicitly (the original notebook displayed it
# automatically as cell output).
print(dataset.duplicated(keep='first').sum())

# remove the duplicate rows in this dataset
# only keep the first instance of the row
dataset = dataset.drop_duplicates(keep='first')

# Class balance after de-duplication.
print(dataset['AMPUTATION'].value_counts())

ax = sns.countplot(x="AMPUTATION", data=dataset)

# Peek at the first rows (printed so it is visible when run as a script).
print(dataset.head())

# Under-sample the majority class to handle the class imbalance.
# Shuffle the dataset first (fixed random_state for reproducibility).
shuffled_dataset = dataset.sample(frac=1, random_state=4)

# Put all the amputation (minority, AMPUTATION == 1) rows in a separate frame.
amputation_dataset = shuffled_dataset.loc[shuffled_dataset['AMPUTATION'] == 1]

# Randomly select 105 observations from the non-amputation (majority) class.
# NOTE(review): 105 is hard-coded — presumably the minority-class size after
# de-duplication; confirm against the value_counts() above if the data changes.
non_amputation_dataset = shuffled_dataset.loc[shuffled_dataset['AMPUTATION'] == 0].sample(n=105, random_state=42)

# Concatenate both frames back into a single balanced dataset.
dataset = pd.concat([amputation_dataset, non_amputation_dataset])

# Class balance after under-sampling (should now be roughly 1:1).
print(dataset['AMPUTATION'].value_counts())

ax = sns.countplot(x="AMPUTATION", data=dataset)

# Persist the cleaned, balanced dataset.
# FIX: index=False — without it the DataFrame's row index is written as an
# extra column, which reappears as a spurious unnamed feature when this file
# is re-read further down for the profiling report.
dataset.to_excel('amputation_removed_duplicates_and_balanced.xlsx', index=False)

# PyCaret's classification module provides setup/create_model/compare_models etc.
from pycaret.classification import *

# Initialise the PyCaret experiment: target column AMPUTATION, fixed
# session_id (seed) so preprocessing and the train/test split are reproducible.
clf = setup(data = dataset, target = 'AMPUTATION',  session_id = 42)

# display the dataset (X_train)
# NOTE: bare get_config(...) only displays in a notebook; as a script the
# return value is discarded.
get_config('X_train')
# converts age from numeric to float
# converts gender and diabetes_class (the two binary category variables) into label encoder conversion
# so, gender_f ---> with value 1 indicating FEMALE is TRUE and value 0 indicating FEMALE is FALSE (and instead, MALE)
# diabetes_class type 1 diabetes ---> value 1 indicates diabetes type 1 and value 0 means diabetes type 2
# then, one hot encoding is applied to the race column (each race is split into separate columns, with value 1 denoting TRUE for that race)

# display the dataset (y_train)
get_config('y_train')

# Train and cross-validate all available classifiers, ranked by AUC;
# returns the best-scoring model.
best_model = compare_models(sort = 'AUC')

# BLEND MODELS, ALHUM
# create models for blending: each base learner is trained, then wrapped in a
# bagging ensemble before being blended into a single soft/hard voting model.
nb = create_model('nb')                              # Naive Bayes
bagged_nb = ensemble_model(nb, method='Bagging')
lr = create_model('lr')                              # Logistic Regression
bagged_lr = ensemble_model(lr, method='Bagging')
lda = create_model('lda')                            # Linear Discriminant Analysis
bagged_lda = ensemble_model(lda, method='Bagging')

rf = create_model('rf')                              # Random Forest
bagged_rf = ensemble_model(rf, method='Bagging')
ada = create_model('ada')                            # AdaBoost
bagged_ada = ensemble_model(ada, method='Bagging')


# Combine the five bagged learners into one voting ensemble.
blend_specific = blend_models(estimator_list = [bagged_nb, bagged_lr, bagged_lda, bagged_rf, bagged_ada])

# plot model (default plot is the AUC/ROC curve)
plot_model(blend_specific)

# tuning: hyperparameter search over the blended ensemble
tuned_blend_specific = tune_model(blend_specific)

# Interactive evaluation dashboard (notebook-only widget).
evaluate_model(tuned_blend_specific)

# Hold-out predictions from the tuned blend (scores the internal test split).
tuned_blend_specific_predictions = predict_model(tuned_blend_specific)

# finalize model for deployment: re-fits the tuned pipeline on the full
# dataset (train + hold-out) so the deployable artifact uses all the data.
final_tuned_blend_specific = finalize_model(tuned_blend_specific)

# save the model
# creates a .pkl file
# FIX: the original saved the non-finalized `tuned_blend_specific`, leaving
# `final_tuned_blend_specific` unused — save the finalized pipeline instead.
save_model(final_tuned_blend_specific, "tuned_blend_specific_model_19112021", verbose=True)

# display the dataset (X_test) — printed so it is visible when run as a script
print(get_config('X_test'))

# display the dataset (y_test)
print(get_config('y_test'))

# Re-load the balanced dataset that was written out above.
dataset2 = pd.read_excel('amputation_removed_duplicates_and_balanced.xlsx')

# Install pandas-profiling (IPython/Colab shell magic — notebook-only syntax).
!pip install pandas-profiling

from pandas_profiling import ProfileReport

# Build an automated exploratory-data-analysis report over the balanced dataset.
profile = ProfileReport(dataset2, title="Pandas Profiling Report")

# Write the report out as a standalone HTML file.
profile.to_file("amputation_removed_duplicates_and_balanced_report.html")