##### Import libraries necessary for this project
import numpy as np
import pandas as pd
# time ML models take to execute to train different models
from time import time
# Allows the use of display() for DataFrames
from IPython.display import display
import matplotlib.pyplot as plt
import sklearn
# Pretty display for notebooks
# Load the diabetes dataset
data = pd.read_csv("diabetes_prediction_dataset.csv")
# Keep only the records of people aged 10 or older.
# read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is
# needed. The explicit .copy() gives `df` its own data, so the column
# re-encoding performed on `df` further down never touches `data`.
df = data[data['age'] >= 10].copy()
from sklearn.preprocessing import LabelEncoder
# Convert gender to integer codes (one integer per distinct value).
# NOTE(review): the UI label further down assumes 0 = Female, 1 = Male,
# 2 = Other -- confirm this matches gender_encoder.classes_.
gender_encoder = LabelEncoder()
df['gender'] = gender_encoder.fit_transform(df['gender'])
# Convert smoking history to integer format
smoking_mapping = {
    'never': 0,
    'No Info': 1,
    'current': 2,
    'former': 3,
    'ever': 4,
    'not current': 5,
}
# Series.map yields NaN for any category that is not a key of the mapping.
df['smoking_history'] = df['smoking_history'].map(smoking_mapping)
# Display the updated DataFrame
# Total number of records
n_records = len(df.index)
# Number of records where outcome = 1
n_1 = len(df[df.diabetes == 1])
# Number of records where outcome = 0.
# df[df.diabetes == 0].shape is a tuple with the dimensions of the filtered
# frame, e.g. (3, 2); element [0] of that tuple is the number of rows.
n_0 = df[df.diabetes == 0].shape[0]
# Percentage of individuals whose outcome is 1
n1_perc = (n_1/n_records) * 100
# Print the results
print("Total number of records: {}".format(n_records))
print("Number of persons diagonised with diabetes : {}".format(n_1))
print("Number of persons not having diabetes : {}".format(n_0))
print("Percentage of people who are Diabetic : {}%".format(n1_perc))
# Splitting into features (X) and target label (y)
features_final = df.drop('diabetes', axis=1)  # Features (excluding 'diabetes')
outcome_r = df['diabetes']  # Target label ('diabetes')
### Visualizing Skewed Continuous Features
# Setting up figure for subplots (a 3x3 grid, i.e. room for nine features)
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
# Plotting each feature. zip() stops at the shorter sequence, so features
# beyond the ninth are not drawn.
for ax, column in zip(axes.flatten(), features_final.columns):
    ax.hist(features_final[column], bins=25, color='skyblue', edgecolor='black', linewidth=1.2)
    ax.set_title(f"{column} Distribution")
# Hide any grid panels left over when there are fewer than nine features,
# so the figure does not show empty axes.
for ax in axes.flatten()[len(features_final.columns):]:
    ax.set_visible(False)
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Predictors are every column except the 'diabetes' target
X = df.drop(columns='diabetes')
y = df['diabetes']
# Hold out 30% of the rows for testing; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Resample with SMOTE on the training portion only, so the test rows stay
# exactly as they came out of the split
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
# Fit a random forest on the resampled training data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_sm, y_train_sm)
# Score the held-out rows
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Report the split sizes (row counts before resampling)
print("Training set has {} samples.".format(len(X_train)))
print("Testing set has {} samples.".format(len(X_test)))
# Import train_test_split
from sklearn.model_selection import train_test_split
# Split the features and the 'diabetes' target into training and testing sets.
# When splitting a dataset into training and testing sets, setting a
# random_state ensures that the split is the same every time you run the code.
# The target (outcome_r) must be passed alongside the features: with only one
# array, train_test_split returns two pieces and the 4-way unpacking fails.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier
# 70/30 split to a new 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(features_final,
                                                    outcome_r,
                                                    test_size = 0.2,
                                                    random_state = 42)
# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
#Import two metrics from sklearn - fbeta_score and accuracy_score
# The F-score is a measure of a model's accuracy on a dataset that combines
# precision and recall into a single number. The general form, the F-beta
# score, is a weighted harmonic mean of precision and recall; the F1-score is
# the special case beta = 1, which gives equal importance to both.
#
# fbeta_score(y_true, y_pred, pos_label=1, average='binary', beta=0.5):
#   y_true: True labels.
#   y_pred: Predicted labels.
#   pos_label: The positive class label.
#   average: Method to calculate the F-score for binary or multiclass classification.
#   beta: Weight of recall in the F-score.
#
# The beta parameter determines the weight of recall in the F-score.
# When beta is 1, the F-score is the harmonic mean of precision and recall (F1-score).
# When beta is less than 1 (e.g., beta=0.5), precision is given more weight.
# When beta is greater than 1, recall is given more weight.
from time import time
from sklearn.metrics import fbeta_score,accuracy_score
def train_predict(learner, sample_size, X_train_sm, y_train_sm, X_test, y_test):
    """
    Fit a learner on the first `sample_size` training rows, then time and
    score its predictions.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train_sm: features training set
       - y_train_sm: target labels of the training set
       - X_test: features testing set
       - y_test: target labels of the testing set

    returns:
       dict with the keys 'train_time', 'pred_time', 'acc_train', 'acc_test',
       'f_train' and 'f_test'. The two '*_train' scores are computed on the
       first 300 training rows only; F-scores use beta = 0.5.
    """
    results = {}

    # Fit the learner on the first `sample_size` rows and time the fit
    start = time()
    learner = learner.fit(X_train_sm[:sample_size], y_train_sm[:sample_size])
    end = time()
    results['train_time'] = end - start

    # Predict on the whole test set and on the first 300 training rows;
    # both predictions are timed together
    start = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train_sm[:300])
    end = time()
    results['pred_time'] = end - start

    # Accuracy on the first 300 training samples
    results['acc_train'] = accuracy_score(y_train_sm[:300], predictions_train)

    # Accuracy on the test set
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # F-score (beta = 0.5) on the first 300 training samples
    results['f_train'] = fbeta_score(y_train_sm[:300], predictions_train, pos_label=1, average='binary', beta=0.5)

    # F-score (beta = 0.5) on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, pos_label=1, average='binary', beta=0.5)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results
#Import the three supervised learning models from sklearn
# LogisticRegression: A linear model for binary classification.
# RandomForestClassifier: An ensemble method that uses multiple decision trees and
# averages their predictions.
# AdaBoostClassifier: An ensemble method that combines multiple weak classifiers
# (like decision trees) to create a strong classifier.
# DecisionTreeClassifier: A decision tree classifier, used as a base estimator in
# AdaBoostClassifier.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
#Initialize the three models
# Every model gets the same fixed seed so repeated runs give the same results
# (the random forest previously had none, unlike the other two).
clf_A = LogisticRegression(random_state=42)
clf_B = RandomForestClassifier(random_state=42)
clf_C = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),random_state=42)
#Calculate the number of samples for 1%, 10%, and 100% of the training data
#samples_100 is the entire training set i.e. len(y_train)
#samples_10 is 10% of samples_100 (int() keeps the count an integer)
#samples_1 is 1% of samples_100 (int() keeps the count an integer)
samples_100 = len(y_train)
samples_10 = int(0.1 * samples_100)
samples_1 = int(0.01 * samples_100)
# Collect results on the learners:
# results[<class name>][<0|1|2>] holds the train_predict() output for the
# 1%, 10% and 100% sample sizes respectively.
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
            train_predict(clf, samples, X_train, y_train, X_test, y_test)
# Imports the module mpatches from matplotlib, which is used to create legend patches.
import matplotlib.patches as mpatches
def evaluate(results, accuracy, f1):
    """
    Visualization code to display results of various learners.

    inputs:
      - results: dict keyed by learner name; each value maps the sample-size
        index (0, 1, 2) to the statistics dict returned by 'train_predict()'
      - accuracy: The accuracy score for the naive predictor
      - f1: The F-score for the naive predictor
    """
    # Create figure
    fig, ax = plt.subplots(2, 3, figsize = (11,11))

    # Constants
    bar_width = 0.3
    colors = ['#A00000','#00A0A0','#00A000']
    metrics = ['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']

    # Draw the bars: one per learner, per metric, per sample size
    for k, learner in enumerate(results.keys()):
        for j, metric in enumerate(metrics):
            # j//3 (floor division) gives the row index (0 or 1) and
            # j%3 (modulo) gives the column index (0, 1, or 2).
            panel = ax[j//3, j%3]
            # i is the index of the training set size (0, 1, 2 for 1%, 10%, 100%)
            for i in np.arange(3):
                # k*bar_width offsets the bars of different learners so
                # they don't overlap at the same x-position.
                panel.bar(i+k*bar_width, results[learner][i][metric], width = bar_width, color = colors[k])

    # The x-axis setup is the same for every panel, so apply it once per
    # panel instead of once per bar.
    for panel in ax.flatten():
        panel.set_xticks([0.45, 1.45, 2.45])
        panel.set_xticklabels(["1%", "10%", "100%"])
        panel.set_xlabel("Training Set Size")
        panel.set_xlim((-0.1, 3.0))

    # Add unique y-labels
    ax[0, 0].set_ylabel("Time (in seconds)")
    ax[0, 1].set_ylabel("Accuracy Score")
    ax[0, 2].set_ylabel("F-score")
    ax[1, 0].set_ylabel("Time (in seconds)")
    ax[1, 1].set_ylabel("Accuracy Score")
    ax[1, 2].set_ylabel("F-score")

    # Add titles
    ax[0, 0].set_title("Model Training")
    ax[0, 1].set_title("Accuracy Score on Training Subset")
    ax[0, 2].set_title("F-score on Training Subset")
    ax[1, 0].set_title("Model Predicting")
    ax[1, 1].set_title("Accuracy Score on Testing Set")
    ax[1, 2].set_title("F-score on Testing Set")

    # Add horizontal lines for naive predictors:
    # both panels in the second column (accuracy plots) get a dashed line at
    # the `accuracy` value, and both panels in the third column (F-score
    # plots) get a dashed line at the `f1` value.
    ax[0, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    ax[1, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    ax[0, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    ax[1, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')

    # Set y-limits for score panels
    ax[0, 1].set_ylim((0, 1))
    ax[0, 2].set_ylim((0, 1))
    ax[1, 1].set_ylim((0, 1))
    ax[1, 2].set_ylim((0, 1))

    # Create patches for the legend (one colored patch per learner)
    patches = []
    for i, learner in enumerate(results.keys()):
        patches.append(mpatches.Patch(color = colors[i], label = learner))
    plt.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
               loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'large')

    # Aesthetics
    plt.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 10, y = 1.10)
    plt.tight_layout(pad = 8)
# Baseline scores for the naive predictor (presumably one that labels every
# record as diabetic): accuracy and precision both equal the share of
# positive records, and recall evaluates to 1.
positive_rate = n_1/n_records
accuracy = positive_rate
precision = positive_rate
recall = np.sum(outcome_r)/np.sum(outcome_r)
# F-beta score with beta = 0.5:
#   (1 + beta^2) * precision * recall / (beta^2 * precision + recall)
beta_sq = np.square(0.5)
fscore = (1+beta_sq)*precision*recall/((beta_sq*precision)+recall)
# Report the baseline, then plot every learner against it
print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))
evaluate(results, accuracy, fscore)
import warnings
# Ignore UserWarning warnings attributed to matplotlib modules from here on.
# NOTE(review): evaluate() has already been called above this line, so any
# warnings from that plot are not covered by this filter -- confirm whether
# the filter should be installed earlier.
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
# GridSearchCV is a tool provided by sklearn for hyperparameter tuning. It performs an exhaustive
# search over a specified parameter grid to find the optimal hyperparameters for a given model.
from sklearn.model_selection import GridSearchCV
# make_scorer is used to convert a metric function into a scorer object that can be used by
# GridSearchCV and other tools that require a scoring function.
from sklearn.metrics import make_scorer, fbeta_score, accuracy_score
#Initialize the classifier
clf = RandomForestClassifier(random_state=42)
# Hyperparameter grid: 5 x 5 = 25 combinations are tried
parameters = {'n_estimators':[5,10,15,20,25],'max_depth': [2, 4, 6, 8, 10]}
#Creating the fbeta_score and accuracy_score scoring objects
scorer = make_scorer(fbeta_score, beta=0.5)
acc_scorer = make_scorer(accuracy_score)
#Perform grid search on classifier using GridSearchCV()
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
#Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train, y_train)
# How the search works:
# - Cross-Validation: The training data is split into k folds (default is usually 5),
#   and the model is trained on k-1 folds and validated on the remaining fold. This process is
#   repeated k times, each time with a different fold as the validation set.
# - Hyperparameter Combinations: Each combination of hyperparameters is tested by training the
#   model on the training data and evaluating its performance on the validation data.
# - Scoring: For each fold, the model's performance is evaluated using the specified scoring
#   method (fbeta_score with beta=0.5 in this case).
# - Average Score: The average score across all folds is calculated for each hyperparameter
#   combination.
# - Best Parameters: The combination of hyperparameters with the highest average score is
#   selected as the best.
#Get the best estimator for classifier
best_clf = grid_fit.best_estimator_
#Make predictions using the unoptimized and optimized classifiers
predictions = clf.fit(X_train, y_train).predict(X_test)
best_predictions = best_clf.predict(X_test)
#Print the results
print("Random Forest")
print("Unoptimized model accuracy: {:.4f}".format(accuracy_score(y_test, predictions)))
print("Optimized model accuracy: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Unoptimized model F-score: {:.4f}".format(fbeta_score(y_test, predictions, beta=0.5)))
print("Optimized model F-score: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
def feature_plot(importances, X_train, y_train):
    """
    Bar-plot the weights of the (up to) five most important features.

    inputs:
      - importances: 1-D array of feature weights, one per column of X_train
      - X_train: DataFrame whose column names label the bars
      - y_train: not used; kept so existing calls keep working
    """
    importances = np.asarray(importances)
    # Never ask for more bars than there are features, otherwise the bar
    # positions and the values would have different lengths.
    top_n = min(5, importances.size)

    # argsort returns the indexes that would sort the array (not the sorted
    # values); reversing them puts the largest weight first.
    indices = np.argsort(importances)[::-1][:top_n]
    # Column names of the top features
    columns = X_train.columns.values[indices]
    # Importance values of the top features
    values = importances[indices]
    positions = np.arange(top_n)

    # Create the plot
    fig = plt.figure(figsize=(10, 6))
    plt.title("Normalized Weights for First Five Most Predictive Features", fontsize=16)

    # Plotting feature weights
    plt.bar(positions, values, width=0.4, align="center", color='#00A000', label="Feature Weight")

    # Plotting cumulative feature weights, shifted left so both bars show
    plt.bar(positions - 0.2, np.cumsum(values), width=0.4, align="center", color='#00A0A0', label="Cumulative Feature Weight")

    plt.xticks(positions, columns, rotation=45)
    plt.xlabel("Feature", fontsize=12)
    plt.ylabel("Weight", fontsize=12)
    plt.legend(loc='upper center')
#Extracting important features
#Import a supervised learning model that has 'feature_importances_'
from sklearn.ensemble import RandomForestClassifier
# Reuse the estimator selected by the grid search (no new training here)
model = best_clf
# Per-feature weights exposed by the fitted estimator
importances = best_clf.feature_importances_
# Plot
feature_plot(importances, X_train, y_train)
# Import functionality for cloning a model
from sklearn.base import clone
# Positions of the five largest importances, biggest first; computed once
# and applied to both the training and the testing frame
top_idx = np.argsort(importances)[::-1][:5]
# Reduce the feature space to those five columns
X_train_reduced = X_train[X_train.columns.values[top_idx]]
X_test_reduced = X_test[X_test.columns.values[top_idx]]
# Fit an unfitted copy of the grid-search winner on the reduced columns
clf = clone(best_clf)
clf = clf.fit(X_train_reduced, y_train)
# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)
# Report scores from the final model using both versions of data
print("Final Model trained on full data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))
print("\nFinal Model trained on reduced data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta = 0.5)))
import numpy as np
# Define the prediction function
def predict_diabetes(gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level):
    """
    Classify a single person with the tuned model (`best_clf`).

    inputs:
      - gender, hypertension, heart_disease: already-encoded integer values
      - age, bmi, HbA1c_level, blood_glucose_level: numeric values
      - smoking_history: one of the category strings in the mapping below

    returns: "Diabetic" when the model predicts class 1, else "Non-Diabetic".

    raises: KeyError when smoking_history is not a known category.
    """
    # Map the smoking category to the same integer code used in preprocessing
    smoking_mapping = {
        'never': 0,
        'No Info': 1,
        'current': 2,
        'former': 3,
        'ever': 4,
        'not current': 5,
    }
    # Prepare the input data as a single row; the order follows this
    # function's parameter order.
    # NOTE(review): presumably this is also the training column order -- confirm.
    input_data = np.array([[gender, age, hypertension, heart_disease, smoking_mapping[smoking_history], bmi, HbA1c_level, blood_glucose_level]])
    # Predict using the trained model; the result has one entry per input row
    prediction = best_clf.predict(input_data)
    # Read the single prediction explicitly instead of relying on the
    # truthiness of a one-element array
    return "Diabetic" if prediction[0] == 1 else "Non-Diabetic"
import gradio as gr
# Create the Gradio interface using the latest syntax
iface = gr.Interface(
    fn=predict_diabetes,  # The function you defined for prediction
    # One input component per predict_diabetes parameter, in signature order
    # (gender, age, hypertension, heart_disease, smoking_history, bmi,
    # HbA1c_level, blood_glucose_level).
    # NOTE(review): the "Age" and "BMI" inputs are reconstructed -- the function
    # needs eight arguments but only six components were present; confirm the
    # intended widgets.
    inputs=[
        gr.Radio(choices=[0, 1, 2], label="Gender (0 = Female, 1 = Male, 2 = Other)"),
        gr.Number(label="Age"),
        gr.Radio(choices=[0, 1], label="Hypertension (0 = No, 1 = Yes)"),
        gr.Radio(choices=[0, 1], label="Heart Disease (0 = No, 1 = Yes)"),
        gr.Dropdown(choices=list(smoking_mapping.keys()), label="Smoking History"),
        gr.Number(label="BMI"),
        gr.Number(label="HbA1c Level"),
        gr.Number(label="Blood Glucose Level"),
    ],
    # predict_diabetes returns a plain string
    outputs="text",
    title="Diabetes Prediction",
    description="Input the features to predict if the person has diabetes."
)
# Launch the Gradio interface
# NOTE(security): share=True requests a public link, and the credentials are
# hard-coded and shown on the login page via auth_message. Acceptable only for
# a throwaway demo; load them from configuration before real use.
iface.launch(share=True, inline = False, auth=('user', '12345'), auth_message='Username = user\nPass = 12345')