##### Import libraries necessary for this project
import numpy as np
import pandas as pd
# time how long the different models take to train
from time import time
# Allows the use of display() for DataFrames -- pretty display for notebooks
from IPython.display import display
import matplotlib.pyplot as plt
import sklearn

# Load the diabetes dataset
data = pd.read_csv("diabetes_prediction_dataset.csv")
# Success - display the first 10 records
display(data.head(10))
data.bmi[data.bmi > 80].value_counts().sort_values()
data.smoking_history.unique()
data.gender.unique()
df = pd.DataFrame(data)
# Filter the DataFrame: keep only records with age >= 10
df = df[df['age'] >= 10]
from sklearn.preprocessing import LabelEncoder
# Convert gender to integer format
gender_encoder = LabelEncoder()
df['gender'] = gender_encoder.fit_transform(df['gender'])
# Convert smoking history to integer format
smoking_mapping = {
    'never': 0,
    'No Info': 1,
    'current': 2,
    'former': 3,
    'ever': 4,
    'not current': 5
}
df['smoking_history'] = df['smoking_history'].map(smoking_mapping)
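# Optional alternative (a sketch, not used in the rest of this notebook): the
# integer mapping above imposes an arbitrary ordering on smoking_history.
# Tree-based models largely tolerate that, but one-hot encoding avoids it:
# df = pd.get_dummies(df, columns=['smoking_history'], prefix='smoking')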
# Display the updated DataFrame
print(df)
# Verify the age filter worked -- this should return an empty result
df.age[df.age < 10].value_counts()
df.smoking_history.unique()
df.gender.unique()
df.describe()
# Check whether any missing values remain anywhere in the DataFrame
print(df.isnull().any().any())
# Total number of records
n_records = len(df.index)
# Number of records where outcome = 1
n_1 = len(df[df.diabetes == 1])
# Number of records where outcome = 0
""" | |
data[data.Outcome == 0], it gives a tuple with the dimension of matrix i.e; (3,2) | |
.shape[0]: | |
This extracts the first element of the tuple returned by .shape, which is the | |
number of rows in the DataFrame. | |
""" | |
n_0 = df[df.diabetes == 0].shape[0] | |
# Percentage of individuals whose outcome is 1
n1_perc = (n_1 / n_records) * 100
# Print the results
print("Total number of records: {}".format(n_records))
print("Number of persons diagnosed with diabetes: {}".format(n_1))
print("Number of persons not having diabetes: {}".format(n_0))
print("Percentage of people who are diabetic: {:.2f}%".format(n1_perc))
# Splitting into features (X) and target label (y)
features_final = df.drop('diabetes', axis=1)  # Features (excluding 'diabetes')
outcome_r = df['diabetes']  # Target label ('diabetes')
### Visualizing Skewed Continuous Features
# Setting up figure for subplots
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
fig.subplots_adjust(hspace=0.5)
# Plotting each feature
for ax, column in zip(axes.flatten(), features_final.columns):
    ax.hist(features_final[column], bins=25, color='skyblue', edgecolor='black', linewidth=1.2)
    ax.set_title(f"{column} Distribution")
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Separate features and target
X = df.drop('diabetes', axis=1)
y = df['diabetes']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Apply SMOTE to the training data to oversample the minority (diabetic) class
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
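# Quick sanity check (a minimal sketch): SMOTE should roughly balance the
# class counts in the resampled training labels.
print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE: ", np.bincount(y_train_sm))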
# Train a model on the resampled data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_sm, y_train_sm)
# Predict on the test set
y_pred = model.predict(X_test)
# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
# Import train_test_split
from sklearn.model_selection import train_test_split
# Split the 'features_final' and 'outcome_r' data into training and testing sets
# (note: this 80/20 split overwrites the 70/30 split used above)
"""
When splitting a dataset into training and testing sets, setting a
random_state ensures that the split is the same every time you run the code.
"""
X_train, X_test, y_train, y_test = train_test_split(features_final,
                                                    outcome_r,
                                                    test_size=0.2,
                                                    random_state=42)
# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
# Import two metrics from sklearn - fbeta_score and accuracy_score
"""
The F-score is a measure of a model's accuracy on a dataset that combines
precision and recall. The F1-score is the special case beta = 1: the harmonic
mean of precision and recall, giving equal importance to both.
fbeta_score(y_true, y_pred, pos_label=1, average='binary', beta=0.5):
    y_true: True labels.
    y_pred: Predicted labels.
    pos_label: The positive class label.
    average: Method to calculate the F-score for binary or multiclass classification.
    beta: Weight of recall in the F-score.
beta:
    The beta parameter determines the weight of recall in the F-score.
    When beta is 1, the F-score is the harmonic mean of precision and recall (F1-score).
    When beta is less than 1 (e.g., beta=0.5), precision is given more weight.
    When beta is greater than 1, recall is given more weight.
"""
from time import time
from sklearn.metrics import fbeta_score, accuracy_score

def train_predict(learner, sample_size, X_train_sm, y_train_sm, X_test, y_test):
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train_sm: features training set
       - y_train_sm: outcome (diabetes) training labels
       - X_test: features testing set
       - y_test: outcome (diabetes) testing labels
    '''
    results = {}
    # Fit the learner to the training data, slicing with 'sample_size':
    # .fit(training_features[:sample_size], training_labels[:sample_size])
    start = time()
    learner = learner.fit(X_train_sm[:sample_size], y_train_sm[:sample_size])
    end = time()
    results['train_time'] = end - start
    # Get the predictions on the test set (X_test),
    # then get predictions on the first 300 training samples using .predict()
    start = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train_sm[:300])
    end = time()
    results['pred_time'] = end - start
    # Compute accuracy on the first 300 training samples
    results['acc_train'] = accuracy_score(y_train_sm[:300], predictions_train)
    # Compute accuracy on the test set using accuracy_score()
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    # Compute F-score on the first 300 training samples using fbeta_score()
    results['f_train'] = fbeta_score(y_train_sm[:300], predictions_train, pos_label=1, average='binary', beta=0.5)
    # Compute F-score on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, pos_label=1, average='binary', beta=0.5)
    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
    # Return the results
    return results
# Import the three supervised learning models from sklearn
"""
LogisticRegression: A linear model for binary classification.
RandomForestClassifier: An ensemble method that uses multiple decision trees and
averages their predictions.
AdaBoostClassifier: An ensemble method that combines multiple weak classifiers
(like decision trees) to create a strong classifier.
DecisionTreeClassifier: A decision tree classifier, used as the base estimator in
AdaBoost.
"""
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Initialize the three models
clf_A = LogisticRegression(random_state=42)
clf_B = RandomForestClassifier(random_state=42)
clf_C = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5), random_state=42)
# Calculate the number of samples for 1%, 10%, and 100% of the training data
# samples_100 is the entire training set, i.e. len(y_train)
# samples_10 is 10% of samples_100 (cast the counts to int, not float)
# samples_1 is 1% of samples_100 (cast the counts to int, not float)
samples_100 = len(y_train)
samples_10 = int(0.1 * samples_100)
samples_1 = int(0.01 * samples_100)
# Collect results on the learners (the unresampled 80/20 split is passed in here)
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
            train_predict(clf, samples, X_train, y_train, X_test, y_test)
results
# Import mpatches from matplotlib, used to create legend patches
import matplotlib.patches as mpatches

def evaluate(results, accuracy, f1):
    """
    Visualization code to display results of various learners.
    inputs:
      - results: a dictionary of dictionaries of the statistics from 'train_predict()'
      - accuracy: the accuracy score for the naive predictor
      - f1: the F-score for the naive predictor
    """
    # Create figure
    fig, ax = plt.subplots(2, 3, figsize=(11, 11))
    # Constants
    bar_width = 0.3
    colors = ['#A00000', '#00A0A0', '#00A000']
    # Super loop to plot six panels of data
    # Iterate through each learner/model in results
    for k, learner in enumerate(results.keys()):
        # Iterate through each metric (training time, accuracy, etc.) to be plotted
        for j, metric in enumerate(['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']):
            # Iterate over the three different sample sizes (1%, 10%, and 100%)
            for i in np.arange(3):
                """
                // performs integer (floor) division:
                j//3: integer division of j by 3 gives the row index (0 or 1).
                j%3: the modulo operation gives the column index (0, 1, or 2).
                The x-position of the bar: i is the index of the training set size
                (0, 1, or 2 for "1%", "10%", "100%"), and k*bar_width offsets the
                bars of different learners/models so they don't overlap.
                """
                ax[j//3, j%3].bar(i + k*bar_width, results[learner][i][metric], width=bar_width, color=colors[k])
            ax[j//3, j%3].set_xticks([0.45, 1.45, 2.45])
            ax[j//3, j%3].set_xticklabels(["1%", "10%", "100%"])
            ax[j//3, j%3].set_xlabel("Training Set Size")
            ax[j//3, j%3].set_xlim((-0.1, 3.0))
    # Add unique y-labels
    ax[0, 0].set_ylabel("Time (in seconds)")
    ax[0, 1].set_ylabel("Accuracy Score")
    ax[0, 2].set_ylabel("F-score")
    ax[1, 0].set_ylabel("Time (in seconds)")
    ax[1, 1].set_ylabel("Accuracy Score")
    ax[1, 2].set_ylabel("F-score")
    # Add titles
    ax[0, 0].set_title("Model Training")
    ax[0, 1].set_title("Accuracy Score on Training Subset")
    ax[0, 2].set_title("F-score on Training Subset")
    ax[1, 0].set_title("Model Predicting")
    ax[1, 1].set_title("Accuracy Score on Testing Set")
    ax[1, 2].set_title("F-score on Testing Set")
    # Add horizontal lines for the naive predictor
    """
    The accuracy panels (second column) get a horizontal line at the naive
    accuracy value; the F-score panels (third column) get one at the naive
    f1 value.
    """
    ax[0, 1].axhline(y=accuracy, xmin=-0.1, xmax=3.0, linewidth=1, color='k', linestyle='dashed')
    ax[1, 1].axhline(y=accuracy, xmin=-0.1, xmax=3.0, linewidth=1, color='k', linestyle='dashed')
    ax[0, 2].axhline(y=f1, xmin=-0.1, xmax=3.0, linewidth=1, color='k', linestyle='dashed')
    ax[1, 2].axhline(y=f1, xmin=-0.1, xmax=3.0, linewidth=1, color='k', linestyle='dashed')
    # Set y-limits for score panels
    ax[0, 1].set_ylim((0, 1))
    ax[0, 2].set_ylim((0, 1))
    ax[1, 1].set_ylim((0, 1))
    ax[1, 2].set_ylim((0, 1))
    # Create patches for the legend
    patches = []
    for i, learner in enumerate(results.keys()):
        patches.append(mpatches.Patch(color=colors[i], label=learner))
    plt.legend(handles=patches, bbox_to_anchor=(-.80, 2.53),
               loc='upper center', borderaxespad=0., ncol=3, fontsize='large')
    # Aesthetics
    plt.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize=10, y=1.10)
    plt.tight_layout(pad=8)
    plt.show()
# Calculate accuracy, precision and recall for a naive predictor that
# labels every person as diabetic
accuracy = n_1 / n_records
precision = n_1 / n_records  # all predictions are positive, so precision = accuracy
recall = np.sum(outcome_r) / np.sum(outcome_r)  # every true positive is caught, so recall = 1
# Calculate F-score for beta = 0.5: F_beta = (1 + beta^2) * P * R / (beta^2 * P + R)
fscore = (1 + np.square(0.5)) * precision * recall / ((np.square(0.5) * precision) + recall)
# Print the results
print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))
evaluate(results, accuracy, fscore)
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
"""
GridSearchCV is a tool provided by sklearn for hyperparameter tuning. It performs an exhaustive
search over a specified parameter grid to find the optimal hyperparameters for a given model.
"""
from sklearn.model_selection import GridSearchCV
"""
make_scorer is used to convert a metric function into a scorer object that can be used by
GridSearchCV and other tools that require a scoring function.
"""
from sklearn.metrics import make_scorer, fbeta_score, accuracy_score
# Initialize the classifier
clf = RandomForestClassifier(random_state=42)
parameters = {'n_estimators': [5, 10, 15, 20, 25], 'max_depth': [2, 4, 6, 8, 10]}
# Create the fbeta_score and accuracy_score scoring objects
scorer = make_scorer(fbeta_score, beta=0.5)
acc_scorer = make_scorer(accuracy_score)
# Perform grid search on the classifier using GridSearchCV()
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
# Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train, y_train)
""" | |
Cross-Validation: The training data is split into k folds (default is usually 5), | |
and the model is trained on k-1 folds and validated on the remaining fold. This process is | |
repeated k times, each time with a different fold as the validation set. | |
Hyperparameter Combinations: Each combination of hyperparameters is tested by training the model on | |
the training data and evaluating its performance on the validation data. | |
Scoring: For each fold, the model's performance is evaluated using the specified scoring method | |
(fbeta_score with beta=0.5 in this case). | |
Average Score: The average score across all folds is calculated for each hyperparameter combination. | |
Best Parameters: The combination of hyperparameters with the highest average score is selected as | |
the best. | |
""" | |
# Get the best estimator for the classifier
best_clf = grid_fit.best_estimator_
# Make predictions using the unoptimized and optimized classifiers
predictions = clf.fit(X_train, y_train).predict(X_test)
best_predictions = best_clf.predict(X_test)
# Print the results
print("Random Forest")
print("Unoptimized model accuracy: {:.4f}".format(accuracy_score(y_test, predictions)))
print("Optimized model accuracy: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Unoptimized model F-score: {:.4f}".format(fbeta_score(y_test, predictions, beta=0.5)))
print("Optimized model F-score: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
def feature_plot(importances, X_train, y_train):
    # Display the five most important features
    # np.argsort returns the indices that would sort the values; [::-1]
    # reverses them so the most important features come first
    indices = np.argsort(importances)[::-1]
    # Extract the names of the top five columns
    columns = X_train.columns.values[indices[:5]]
    # Importance values of the top five columns
    values = importances[indices][:5]
    # Create the plot
    fig = plt.figure(figsize=(10, 6))
    plt.title("Normalized Weights for First Five Most Predictive Features", fontsize=16)
    # Plotting feature weights
    plt.bar(np.arange(5), values, width=0.4, align="center", color='#00A000', label="Feature Weight")
    # Plotting cumulative feature weights
    plt.bar(np.arange(5) - 0.2, np.cumsum(values), width=0.4, align="center", color='#00A0A0', label="Cumulative Feature Weight")
    plt.xticks(np.arange(5), columns, rotation=45)
    plt.xlabel("Feature", fontsize=12)
    plt.ylabel("Weight", fontsize=12)
    plt.legend(loc='upper center')
    plt.tight_layout()
    plt.show()
# Extracting important features
# Import a supervised learning model that has 'feature_importances_'
from sklearn.ensemble import RandomForestClassifier
# Reuse the tuned model (already fitted on X_train, y_train by the grid search)
model = best_clf
# Extract the feature importances using .feature_importances_
importances = model.feature_importances_
# Plot
feature_plot(importances, X_train, y_train)
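# Tabular view (sketch): the same importances as a sorted Series, which is
# easier to read off exactly than the bar chart.
print(pd.Series(importances, index=X_train.columns).sort_values(ascending=False))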
# Import functionality for cloning a model
from sklearn.base import clone
# Reduce the feature space to the five most important features
X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances)[::-1])[:5]]]
# Train a clone of the "best" model found from grid search earlier
clf = (clone(best_clf)).fit(X_train_reduced, y_train)
# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)
# Report scores from the final model using both versions of the data
print("Final Model trained on full data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
print("\nFinal Model trained on reduced data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta=0.5)))
import numpy as np
# Define the prediction function
def predict_diabetes(gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level):
    # Map the input values as per the preprocessing above
    smoking_mapping = {
        'never': 0,
        'No Info': 1,
        'current': 2,
        'former': 3,
        'ever': 4,
        'not current': 5
    }
    # Prepare the input data in the same column order as the training data
    input_data = np.array([[gender, age, hypertension, heart_disease, smoking_mapping[smoking_history], bmi, HbA1c_level, blood_glucose_level]])
    # Predict using the trained model
    prediction = best_clf.predict(input_data)
    # Return the prediction for the single input row
    return "Diabetic" if prediction[0] == 1 else "Non-Diabetic"

predict_diabetes(0, 21, 0, 0, "never", 24.00, 5.1, 104)
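# Note (an assumption about sklearn's behavior, not exercised above): because
# best_clf was fitted on a DataFrame, predicting on a bare numpy array can
# trigger a feature-name warning. Passing a one-row DataFrame with the
# training columns avoids it:
# input_df = pd.DataFrame(input_data, columns=X_train.columns)
# prediction = best_clf.predict(input_df)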
import gradio as gr
# Create the Gradio interface
iface = gr.Interface(
    fn=predict_diabetes,  # The prediction function defined above
    inputs=[
        gr.Radio(choices=[0, 1, 2], label="Gender (0 = Female, 1 = Male, 2 = Other)"),
        gr.Number(label="Age"),
        gr.Radio(choices=[0, 1], label="Hypertension (0 = No, 1 = Yes)"),
        gr.Radio(choices=[0, 1], label="Heart Disease (0 = No, 1 = Yes)"),
        gr.Dropdown(choices=list(smoking_mapping.keys()), label="Smoking History"),
        gr.Number(label="BMI"),
        gr.Number(label="HbA1c Level"),
        gr.Number(label="Blood Glucose Level"),
    ],
    outputs="text",
    title="Diabetes Prediction",
    description="Input the features to predict if the person has diabetes."
)
# Launch the Gradio interface
iface.launch(share=True, inline=False, auth=('user', '12345'), auth_message='Username = user\nPass = 12345')