Spaces:

yadvi
/

diabetes_prediction

Sleeping

App Files Files Community

diabetes_prediction / app.py

yadvi

Update app.py

9f3e4ce verified 8 months ago

raw

history blame contribute delete

20.3 kB

	##### Import libraries necessary for this project
	import numpy as np
	import pandas as pd

	# time ML models take to execute to train different models
	from time import time

	# Allows the use of display() for DataFrames
	from IPython.display import display

	import matplotlib.pyplot as plt
	import sklearn
	# Pretty display for notebooks

	# Load the diabetes dataset
	data = pd.read_csv("diabetes_prediction_dataset.csv")

	# Show the first 10 records as a quick check that the load worked
	display(data.head(10))

	# Exploratory look-ups carried over from the notebook. As bare expressions
	# they are evaluated and discarded when this file runs as a script.
	data.bmi[data.bmi > 80].value_counts().sort_values()

	data.smoking_history.unique()

	data.gender.unique()

	# Keep only the records with age >= 10. The explicit .copy() makes `df` an
	# independent frame, so the column assignments below do not raise pandas'
	# SettingWithCopyWarning.
	df = data[data['age'] >= 10].copy()

	from sklearn.preprocessing import LabelEncoder

	# Convert gender to integer codes
	gender_encoder = LabelEncoder()
	df['gender'] = gender_encoder.fit_transform(df['gender'])

	# Convert smoking history to integer format
	smoking_mapping = {
	    'never': 0,
	    'No Info': 1,
	    'current': 2,
	    'former': 3,
	    'ever': 4,
	    'not current': 5,
	}
	# Series.map leaves NaN for any category missing from the mapping; the
	# null check at the end of this section would reveal that.
	df['smoking_history'] = df['smoking_history'].map(smoking_mapping)

	# Display the updated DataFrame
	print(df)

	# Sanity checks on the filtered frame; the mask is built from `df` itself so
	# that it shares the filtered index (expected to select nothing).
	df.age[df.age < 10].value_counts()

	df.smoking_history.unique()

	df.gender.unique()

	df.describe()

	# True if any missing value remains anywhere in the frame
	print(df.isnull().any().any())

	# Size of the filtered dataset
	n_records = df.shape[0]

	# Rows labelled diabetic (diabetes == 1)
	n_1 = df[df.diabetes == 1].shape[0]

	# Rows labelled non-diabetic (diabetes == 0); len() of the filtered frame
	# is its number of rows.
	n_0 = len(df[df.diabetes == 0])

	# Share of diabetic records, as a percentage
	n1_perc = n_1 / n_records * 100

	# Report the class balance
	for template, value in (
	    ("Total number of records: {}", n_records),
	    ("Number of persons diagonised with diabetes : {}", n_1),
	    ("Number of persons not having diabetes : {}", n_0),
	    ("Percentage of people who are Diabetic : {}%", n1_perc),
	):
	    print(template.format(value))

	# Features are every column except the label; the label is 'diabetes'
	outcome_r = df['diabetes']
	features_final = df.drop(columns='diabetes')

	### Visualizing Skewed Continuous Features

	# 3x3 grid of subplots, one panel per feature column
	fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
	fig.subplots_adjust(hspace=0.5)

	# Plot a histogram for each feature; zip stops at the shorter of the two,
	# so at most nine features are drawn.
	flat_axes = axes.flatten()
	for ax, column in zip(flat_axes, features_final.columns):
	    ax.hist(features_final[column], bins=25, color='skyblue', edgecolor='black', linewidth=1.2)
	    ax.set_title(f"{column} Distribution")
	    ax.set_xlabel("Value")
	    ax.set_ylabel("Frequency")

	# Hide the panels left over when there are fewer features than grid cells,
	# instead of showing empty axes.
	for ax in flat_axes[len(features_final.columns):]:
	    ax.set_visible(False)

	plt.show()

	import pandas as pd
	from sklearn.model_selection import train_test_split
	from imblearn.over_sampling import SMOTE
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.metrics import classification_report, confusion_matrix


	# The label is the 'diabetes' column; every other column is a feature
	y = df['diabetes']
	X = df.drop(columns=['diabetes'])

	# Hold out 30% of the rows for testing, with a fixed seed
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.3, random_state=42
	)

	# Resample the training portion only with SMOTE
	smote = SMOTE(random_state=42)
	X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

	# Fit a random forest on the resampled training data
	model = RandomForestClassifier(random_state=42)
	model.fit(X_train_sm, y_train_sm)

	# Score the model on the untouched test rows
	y_pred = model.predict(X_test)
	print(confusion_matrix(y_test, y_pred))
	print(classification_report(y_test, y_pred))

	# Report the split sizes (before resampling)
	print("Training set has {} samples.".format(len(X_train)))
	print("Testing set has {} samples.".format(len(X_test)))

	# Import train_test_split
	from sklearn.model_selection import train_test_split

	# Re-split the features and the 'diabetes' labels, this time holding out
	# 20% of the rows for testing. A fixed random_state makes the split the
	# same every time the code runs.
	X_train, X_test, y_train, y_test = train_test_split(
	    features_final, outcome_r, test_size=0.2, random_state=42
	)

	# Report the sizes of the two partitions
	print("Training set has {} samples.".format(len(X_train)))
	print("Testing set has {} samples.".format(len(X_test)))

	#Import two metrics from sklearn - fbeta_score and accuracy_score

	"""
	The F-beta score is a measure of a model's accuracy on a dataset. It is the weighted
	harmonic mean of precision and recall. The F1-score is the special case beta = 1,
	which gives equal importance to both.

	fbeta_score(y_true, y_pred, pos_label=1, average='binary', beta=0.5):
	y_true: True labels.
	y_pred: Predicted labels.
	pos_label: The positive class label.
	average: Method to calculate the F-score for binary or multiclass classification.
	beta: Weight of recall in the F-score.

	beta:
	The beta parameter determines the weight of recall in the F-score.
	When beta is 1, the F-score is the harmonic mean of precision and recall (F1-score).
	When beta is less than 1 (e.g., beta=0.5), precision is given more weight.
	When beta is greater than 1, recall is given more weight.
	"""

	from time import time
	from sklearn.metrics import fbeta_score,accuracy_score
	def train_predict(learner, sample_size, X_train_sm, y_train_sm, X_test, y_test,
	                  train_eval_size=300):
	    '''
	    Fit a learner on the leading rows of the training data, then time and
	    score its predictions on the test set and on a training subset.

	    inputs:
	    - learner: the learning algorithm to be trained and predicted on
	    - sample_size: the number of leading training rows used for fitting
	    - X_train_sm: features training set
	    - y_train_sm: labels training set
	    - X_test: features testing set
	    - y_test: labels testing set
	    - train_eval_size: the number of leading training rows scored for the
	      training-side metrics (default 300, the previously hard-coded value)

	    returns:
	    - dict with the keys 'train_time', 'pred_time', 'acc_train', 'acc_test',
	      'f_train' and 'f_test' (F-scores use beta = 0.5)

	    Relies on time(), accuracy_score() and fbeta_score() being imported at
	    module level.
	    '''
	    results = {}

	    # Fit the learner on the first 'sample_size' training rows and time it
	    start = time()
	    learner = learner.fit(X_train_sm[:sample_size], y_train_sm[:sample_size])
	    end = time()
	    results['train_time'] = end - start

	    # Time both prediction passes together: the full test set, then the
	    # leading training rows used for the training-side metrics
	    start = time()
	    predictions_test = learner.predict(X_test)
	    predictions_train = learner.predict(X_train_sm[:train_eval_size])
	    end = time()
	    results['pred_time'] = end - start

	    # Same training slice as the predictions above, so the lengths match
	    y_train_eval = y_train_sm[:train_eval_size]

	    # Accuracy on the training subset and on the test set
	    results['acc_train'] = accuracy_score(y_train_eval, predictions_train)
	    results['acc_test'] = accuracy_score(y_test, predictions_test)

	    # F-score (beta = 0.5) on the training subset and on the test set
	    results['f_train'] = fbeta_score(y_train_eval, predictions_train, pos_label=1, average='binary', beta=0.5)
	    results['f_test'] = fbeta_score(y_test, predictions_test, pos_label=1, average='binary', beta=0.5)

	    # Success
	    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

	    # Return the results
	    return results

	#Import the three supervised learning models from sklearn
	"""
	LogisticRegression: A linear model for binary classification.

	RandomForestClassifier: An ensemble method that uses multiple decision trees and
	averages their predictions.

	AdaBoostClassifier: An ensemble method that combines multiple weak classifiers
	(like decision trees) to create a strong classifier.

	DecisionTreeClassifier: A decision tree classifier, used as a base estimator in
	AdaBoost.
	"""
	from sklearn.linear_model import LogisticRegression
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.ensemble import AdaBoostClassifier
	from sklearn.tree import DecisionTreeClassifier

	#Initialize the three models
	clf_A = LogisticRegression(random_state=42)
	# Seeded like the other two learners so repeated runs give comparable results
	clf_B = RandomForestClassifier(random_state=42)
	clf_C = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5), random_state=42)

	#Calculate the number of samples for 1%, 10%, and 100% of the training data
	#samples_100 is the entire training set i.e. len(y_train)
	#samples_10 and samples_1 are truncated to `int` because they are used as slice bounds
	samples_100 = len(y_train)
	samples_10 = int(0.1 * samples_100)
	samples_1 = int(0.01 * samples_100)

	# Collect results on the learners:
	# results[<learner class name>][<sample-size index 0..2>] -> metrics dict
	results = {}
	for clf in [clf_A, clf_B, clf_C]:
	    clf_name = clf.__class__.__name__
	    results[clf_name] = {}
	    for i, samples in enumerate([samples_1, samples_10, samples_100]):
	        results[clf_name][i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)


	# Notebook leftover: displays the dict in a notebook, no effect in a script
	results

	# Imports the module mpatches from matplotlib, which is used to create legend patches.
	import matplotlib.patches as mpatches
	def evaluate(results, accuracy, f1):
	    """
	    Visualization code to display results of various learners.

	    Draws a 2x3 grid of bar charts: the top row shows training time and the
	    accuracy / F-score on the training subset, the bottom row shows
	    prediction time and the accuracy / F-score on the testing set.

	    inputs:
	      - results: dict keyed by learner name; each value maps the sample-size
	        index (0, 1, 2) to a metrics dict as returned by 'train_predict()'
	      - accuracy: The accuracy score for the naive predictor
	      - f1: The F-score for the naive predictor
	    """

	    # Panel order is row-major: metric j is drawn at ax[j//3, j%3]
	    metrics = ['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']

	    # Create figure
	    fig, ax = plt.subplots(2, 3, figsize = (11,11))

	    # Constants
	    bar_width = 0.3
	    colors = ['#A00000','#00A0A0','#00A000']

	    # One bar per (learner, metric, sample size). The x-position is the
	    # sample-size index i, shifted by k*bar_width so that the bars of
	    # different learners sit side by side instead of overlapping.
	    for k, learner in enumerate(results.keys()):
	        # Cycle through the palette if there are more learners than colours
	        color = colors[k % len(colors)]
	        for j, metric in enumerate(metrics):
	            panel = ax[j//3, j%3]
	            for i in range(3):
	                panel.bar(i+k*bar_width, results[learner][i][metric], width = bar_width, color = color)

	    # The x-axis setup is identical for every panel, so apply it once per
	    # panel rather than once per bar.
	    for panel in ax.flatten():
	        panel.set_xticks([0.45, 1.45, 2.45])
	        panel.set_xticklabels(["1%", "10%", "100%"])
	        panel.set_xlabel("Training Set Size")
	        panel.set_xlim((-0.1, 3.0))

	    # Add unique y-labels and titles (same row-major order as 'metrics')
	    panel_text = [
	        ("Time (in seconds)", "Model Training"),
	        ("Accuracy Score", "Accuracy Score on Training Subset"),
	        ("F-score", "F-score on Training Subset"),
	        ("Time (in seconds)", "Model Predicting"),
	        ("Accuracy Score", "Accuracy Score on Testing Set"),
	        ("F-score", "F-score on Testing Set"),
	    ]
	    for panel, (ylabel, title) in zip(ax.flatten(), panel_text):
	        panel.set_ylabel(ylabel)
	        panel.set_title(title)

	    # Add horizontal lines for naive predictors: the accuracy panels (middle
	    # column) get a line at 'accuracy', the F-score panels (right column) a
	    # line at 'f1'. axhline's xmin/xmax are fractions of the axes width, so
	    # the defaults already span the whole panel.
	    for row in (0, 1):
	        ax[row, 1].axhline(y = accuracy, linewidth = 1, color = 'k', linestyle = 'dashed')
	        ax[row, 2].axhline(y = f1, linewidth = 1, color = 'k', linestyle = 'dashed')

	        # Set y-limits for score panels
	        ax[row, 1].set_ylim((0, 1))
	        ax[row, 2].set_ylim((0, 1))

	    # Create patches for the legend
	    patches = []
	    for i, learner in enumerate(results.keys()):
	        patches.append(mpatches.Patch(color = colors[i % len(colors)], label = learner))
	    plt.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
	               loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'large')

	    # Aesthetics
	    plt.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 10, y = 1.10)
	    plt.tight_layout(pad = 8)
	    plt.show()


	#Calculate accuracy, precision and recall for the naive predictor.
	# Accuracy and precision both equal the share of diabetic records and recall
	# is 1, i.e. the figures of a predictor that labels every record as diabetic.
	accuracy = n_1/n_records
	precision = n_1/n_records
	# NOTE: evaluates to NaN if the label column contains no positive records
	recall = np.sum(outcome_r)/np.sum(outcome_r)

	# F-beta score for beta = 0.5:
	#   (1 + beta^2) * precision * recall / (beta^2 * precision + recall)
	fscore = (1+np.square(0.5))*precision*recall/((np.square(0.5)*precision)+recall)

	# Print the results
	print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))

	evaluate(results, accuracy, fscore)

	import warnings
	warnings.filterwarnings(action="ignore", category=UserWarning, module="matplotlib")

	#Import necessary libraries
	from sklearn.ensemble import RandomForestClassifier

	# GridSearchCV is sklearn's hyperparameter-tuning tool: it searches
	# exhaustively over a parameter grid for the best-scoring combination.
	from sklearn.model_selection import GridSearchCV

	# make_scorer turns a metric function into a scorer object that
	# GridSearchCV accepts as its 'scoring' argument.
	from sklearn.metrics import make_scorer, fbeta_score, accuracy_score

	# Baseline classifier and the grid of hyperparameters to try
	clf = RandomForestClassifier(random_state=42)
	parameters = {
	    'n_estimators': [5, 10, 15, 20, 25],
	    'max_depth': [2, 4, 6, 8, 10],
	}

	# Scoring objects: F-beta (beta = 0.5) drives the search
	scorer = make_scorer(fbeta_score, beta=0.5)
	acc_scorer = make_scorer(accuracy_score)

	# Each parameter combination is cross-validated on the training data and
	# scored with 'scorer' on every fold; the combination with the highest
	# mean score is refitted and exposed as best_estimator_.
	grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=scorer)
	grid_fit = grid_obj.fit(X_train, y_train)

	#Get the best estimator for classifier
	best_clf = grid_fit.best_estimator_

	# Predictions from the default-parameter model and from the tuned model
	clf.fit(X_train, y_train)
	predictions = clf.predict(X_test)
	best_predictions = best_clf.predict(X_test)

	#Print the results
	print("Random Forest")
	for template, score in (
	    ("Unoptimized model accuracy: {:.4f}", accuracy_score(y_test, predictions)),
	    ("Optimized model accuracy: {:.4f}", accuracy_score(y_test, best_predictions)),
	    ("Unoptimized model F-score: {:.4f}", fbeta_score(y_test, predictions, beta=0.5)),
	    ("Optimized model F-score: {:.4f}", fbeta_score(y_test, best_predictions, beta=0.5)),
	):
	    print(template.format(score))


	def feature_plot(importances, X_train, y_train, top_n=5):
	    """
	    Plot the weights of the most important features as a bar chart.

	    inputs:
	      - importances: NumPy array of feature importances, one per column of X_train
	      - X_train: DataFrame whose column names label the bars
	      - y_train: unused; kept so existing callers keep working
	      - top_n: number of features to show (default 5); capped at the
	        number of available importances
	    """
	    n_shown = min(top_n, len(importances))

	    # argsort returns indices (not values); reversing them orders the
	    # features from most to least important
	    indices = np.argsort(importances)[::-1][:n_shown]

	    # Names and importance values of the selected features
	    columns = X_train.columns.values[indices]
	    values = importances[indices]
	    positions = np.arange(n_shown)

	    # Create the plot
	    plt.figure(figsize=(10, 6))
	    if n_shown == 5:
	        title = "Normalized Weights for First Five Most Predictive Features"
	    else:
	        title = "Normalized Weights for First {} Most Predictive Features".format(n_shown)
	    plt.title(title, fontsize=16)

	    # The two series are drawn side by side around each tick: bars are 0.4
	    # wide, so centres at -0.2 / +0.2 touch without overlapping.
	    plt.bar(positions + 0.2, values, width=0.4, align="center", color='#00A000', label="Feature Weight")
	    plt.bar(positions - 0.2, np.cumsum(values), width=0.4, align="center", color='#00A0A0', label="Cumulative Feature Weight")

	    plt.xticks(positions, columns, rotation=45)
	    plt.xlabel("Feature", fontsize=12)
	    plt.ylabel("Weight", fontsize=12)
	    plt.legend(loc='upper center')
	    plt.tight_layout()
	    plt.show()

	#Extracting important features
	#Import a supervised learning model that has 'feature_importances_'
	from sklearn.ensemble import RandomForestClassifier

	# Reuse the tuned classifier from the grid search; it is already fitted
	model = best_clf

	# Per-feature importances of that model
	importances = model.feature_importances_

	# Plot
	feature_plot(importances, X_train, y_train)

	# Import functionality for cloning a model
	from sklearn.base import clone

	# Names of the five features with the largest importances
	top_features = X_train.columns.values[(np.argsort(importances)[::-1])[:5]]

	# Reduce the feature space of both partitions to those columns
	X_train_reduced = X_train[top_features]
	X_test_reduced = X_test[top_features]

	# Fit an unfitted copy of the best grid-search model on the reduced features
	clf = clone(best_clf).fit(X_train_reduced, y_train)

	# Make new predictions
	reduced_predictions = clf.predict(X_test_reduced)

	# Report scores from the final model using both versions of data
	for heading, preds in (
	    ("Final Model trained on full data\n------", best_predictions),
	    ("\nFinal Model trained on reduced data\n------", reduced_predictions),
	):
	    print(heading)
	    print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, preds)))
	    print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, preds, beta = 0.5)))

	import numpy as np

	# Define the prediction function
	def predict_diabetes(gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level):
	    """
	    Classify a single person with the tuned model `best_clf`.

	    inputs:
	      - gender, hypertension, heart_disease: integer codes
	      - age, bmi, HbA1c_level, blood_glucose_level: numeric values
	      - smoking_history: one of the category names in the mapping below

	    returns:
	      - "Diabetic" if the model predicts class 1, otherwise "Non-Diabetic"

	    raises:
	      - KeyError if smoking_history is not a known category
	    """
	    # Same category codes as used when preprocessing the training data
	    smoking_mapping = {
	        'never': 0,
	        'No Info': 1,
	        'current': 2,
	        'former': 3,
	        'ever': 4,
	        'not current': 5,
	    }

	    # A single row of features.
	    # NOTE(review): the order is assumed to match the training columns -
	    # confirm against the CSV header.
	    input_data = np.array([[gender, age, hypertension, heart_disease,
	                            smoking_mapping[smoking_history], bmi,
	                            HbA1c_level, blood_glucose_level]])

	    # If the model recorded column names when it was fitted, pass the row as
	    # a DataFrame with those names so sklearn does not warn about missing
	    # feature names.
	    feature_names = getattr(best_clf, "feature_names_in_", None)
	    if feature_names is not None and len(feature_names) == input_data.shape[1]:
	        input_data = pd.DataFrame(input_data, columns=feature_names)

	    # Predict using the trained model
	    prediction = best_clf.predict(input_data)

	    # predict() returns one label per input row; there is exactly one row
	    return "Diabetic" if prediction[0] == 1 else "Non-Diabetic"

	# Try the prediction function once with a sample record (result is discarded)
	predict_diabetes(0,21,0,0,"never",24.00,5.1,104)


	import os

	import gradio as gr

	# Login credentials. They can be overridden with the APP_USERNAME and
	# APP_PASSWORD environment variables so a real deployment does not have to
	# ship the demo password; without them the original demo login is used.
	default_auth = ('user', '12345')
	auth_user = os.environ.get("APP_USERNAME", default_auth[0])
	auth_pass = os.environ.get("APP_PASSWORD", default_auth[1])

	# Only show the credentials on the login page when they are the demo ones
	if (auth_user, auth_pass) == default_auth:
	    auth_message = 'Username = user\nPass = 12345'
	else:
	    auth_message = None

	# Create the Gradio interface using the latest syntax
	iface = gr.Interface(
	    fn=predict_diabetes,  # The function you defined for prediction
	    inputs=[
	        gr.Radio(choices=[0, 1, 2], label="Gender (0 = Female, 1 = Male, 2 = Other)"),
	        gr.Number(label="Age"),
	        gr.Radio(choices=[0, 1], label="Hypertension (0 = No, 1 = Yes)"),
	        gr.Radio(choices=[0, 1], label="Heart Disease (0 = No, 1 = Yes)"),
	        gr.Dropdown(choices=list(smoking_mapping.keys()), label="Smoking History"),
	        gr.Number(label="BMI"),
	        gr.Number(label="HbA1c Level"),
	        gr.Number(label="Blood Glucose Level"),
	    ],
	    outputs="text",
	    title="Diabetes Prediction",
	    description="Input the features to predict if the person has diabetes."
	)

	# Launch the Gradio interface
	iface.launch(share=True, inline = False, auth=(auth_user, auth_pass), auth_message=auth_message)