import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
try:
    from google.colab import files  # file-upload widget, only available inside Google Colab
except ImportError:
    files = None  # running outside Colab; main() falls back to synthetic data
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
# Step 1: Upload CSV File (if user wants to)
def upload_csv_file():
print("Please upload your CSV file.")
uploaded = files.upload()
# Assuming a single CSV file is uploaded
file_name = next(iter(uploaded)) # Get the name of the uploaded file
df = pd.read_csv(file_name)
# Print the column names to check for mismatches
print("Columns in the uploaded file:")
print(df.columns)
return df
# Step 2: Generate Synthetic Dataset (if no CSV uploaded)
def generate_synthetic_data():
    np.random.seed(42)
    # Synthetic data (100 rows, 12 columns)
    data = {
        'Age': np.random.randint(40, 80, 100),
        'Gender': np.random.choice(['Male', 'Female'], 100),
        'Cholesterol': np.random.randint(150, 250, 100),
        'Systolic_BP': np.random.randint(90, 180, 100),
        'Diastolic_BP': np.random.randint(60, 120, 100),
        'Cognitive_Score1': np.random.randint(1, 4, 100),
        'Cognitive_Score2': np.random.randint(1, 4, 100),
        'Cognitive_Score3': np.random.randint(1, 4, 100),
        'Family_History': np.random.choice([0, 1], 100),   # 0: No, 1: Yes
        'Medical_History': np.random.choice([0, 1], 100),  # 0: No, 1: Yes
        'Test_Result': np.random.choice([0, 1], 100),      # 0: Negative, 1: Positive
        'Alzheimers_Diagnosis': np.random.choice([0, 1], 100)  # 0: No, 1: Yes
    }
    df = pd.DataFrame(data)
    return df
# Step 3: Encode categorical variables
def encode_categorical_columns(df):
    # List of categorical columns (to be encoded into numeric)
    categorical_columns = ['Gender', 'Test_Result', 'Family_History', 'Medical_History']
    label_encoder = LabelEncoder()
    for col in categorical_columns:
        if col in df.columns:
            df[col] = label_encoder.fit_transform(df[col])
    return df
# Step 4: Train a model with the dataset using GradientBoostingClassifier
def train_model(df):
    # Check that the target column exists
    if 'Alzheimers_Diagnosis' not in df.columns:
        raise KeyError("The column 'Alzheimers_Diagnosis' is missing from the dataset.")
    # Encode categorical columns to numeric
    df = encode_categorical_columns(df)
    # Feature selection (exclude 'Alzheimers_Diagnosis', which is the target)
    features = df.drop(columns=['Alzheimers_Diagnosis'])
    target = df['Alzheimers_Diagnosis']
    # Check for missing values
    if features.isnull().sum().any():
        print("Warning: Missing values found in features, filling with median.")
        features = features.fillna(features.median())
    if target.isnull().sum() > 0:
        print("Warning: Missing values found in target, filling with mode.")
        target = target.fillna(target.mode()[0])
    # Scale the data (important for features like 'Age' and blood pressure)
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    # Save the original feature names for plotting feature importance
    feature_names = features.columns  # Original column names before scaling
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
    # Sanity-check that all arrays have consistent shapes
    print(f"Features shape: {features_scaled.shape}, Target shape: {target.shape}")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
    # Initialize GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=3, random_state=42)
    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0]
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    print("Best Hyperparameters found: ", grid_search.best_params_)
    # Use the best model from GridSearchCV (refit on the full training split by default)
    best_model = grid_search.best_estimator_
    # Make predictions
    y_pred = best_model.predict(X_test)
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy * 100:.2f}%")
    # Cross-validation for a more robust estimate of model performance
    cv_scores = cross_val_score(best_model, features_scaled, target, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average Cross-validation accuracy: {np.mean(cv_scores) * 100:.2f}%")
    # Display classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    # Save the trained model
    joblib.dump(best_model, 'alzheimer_model_gbc_best.pkl')
    # Probability of class '1' (Alzheimer's diagnosis) on the test set, used for the ROC/PR curves
    probabilities = best_model.predict_proba(X_test)[:, 1]
    # Ensure that the lengths of X_test and y_test are consistent
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test have inconsistent lengths.")
    return best_model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names
# Step 5: Predict Alzheimer's risk based on new data
def predict_risk(model, scaler, new_data):
    # Scale the new data using the same scaler fitted on the training data
    new_data_scaled = scaler.transform(new_data)
    # Predict Alzheimer's risk for the new data
    predictions = model.predict(new_data_scaled)
    # Predict probabilities of class '1' (Alzheimer's diagnosis) for richer output
    probabilities = model.predict_proba(new_data_scaled)[:, 1]
    return predictions, probabilities
# Step 6: Display predictions and feature importance
def display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df):
    # Ensure consistent lengths
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test must have the same length.")
    # Display predictions and probabilities
    print("\nPredictions (0: No, 1: Yes for Alzheimer's risk):")
    print(y_pred)
    print("\nPredicted probabilities for Alzheimer's (0 to 1 scale):")
    print(probabilities)
    # Display risk insights based on probabilities
    print("\nRisk Insights:")
    for i, prob in enumerate(probabilities):
        if prob < 0.3:
            print(f"Individual {i+1}: Healthy (Probability: {prob:.2f})")
        elif prob < 0.7:
            print(f"Individual {i+1}: At Low Risk (Probability: {prob:.2f})")
        elif prob < 0.9:
            print(f"Individual {i+1}: At High Risk (Probability: {prob:.2f})")
        else:
            print(f"Individual {i+1}: Severe Alzheimer's (Probability: {prob:.2f})")
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No", "Yes"], yticklabels=["No", "Yes"])
    plt.title("Confusion Matrix")
    plt.show()
    # ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, probabilities)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='blue', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()
    # F1 Score
    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1:.2f}")
    # Plot Feature Importance
    feature_importance = model.feature_importances_
    feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    feature_df = feature_df.sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_df)
    plt.title("Feature Importance")
    plt.show()
    # Age Distribution Graph
    plt.figure(figsize=(8, 6))
    sns.histplot(df['Age'], kde=True)
    plt.title("Age Distribution")
    plt.xlabel("Age")
    plt.ylabel("Frequency")
    plt.show()
    # Pie chart for Gender distribution
    gender_dist = df['Gender'].value_counts()
    plt.figure(figsize=(6, 6))
    gender_dist.plot.pie(autopct='%1.1f%%', startangle=90, colors=['#66b3ff', '#ffb3e6'])
    plt.title("Gender Distribution")
    plt.ylabel('')
    plt.show()
    # 3D Scatter plot for Age vs Cognitive Score 1 vs Diagnosis
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df['Age'], df['Cognitive_Score1'], df['Alzheimers_Diagnosis'], c=df['Alzheimers_Diagnosis'], cmap='coolwarm', marker='o')
    ax.set_xlabel('Age')
    ax.set_ylabel('Cognitive Score 1')
    ax.set_zlabel("Alzheimer's Diagnosis")
    ax.set_title('3D Scatter Plot (Age vs Cognitive Score 1 vs Diagnosis)')
    plt.show()
# Run the program
def main():
    try:
        # Option 1: upload your own CSV file; Option 2: generate synthetic data
        print("1. Upload CSV file")
        print("2. Generate Synthetic Data")
        choice = input("Enter choice (1/2): ")
        if choice == '1':
            df = upload_csv_file()
        else:
            df = generate_synthetic_data()
    except Exception as e:
        print(f"Error uploading CSV, falling back to synthetic data: {e}")
        df = generate_synthetic_data()
    # Train the model
    model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names = train_model(df)
    # Display the output
    display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df)
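    # Example (sketch only, not part of the original flow): predict_risk() is defined above
    # but never called here. Uncommenting the lines below shows one way to score a new
    # individual, assuming the synthetic-data schema; the feature values are illustrative
    # and the column names must match whatever dataset the model was trained on.
    # new_individual = pd.DataFrame([{
    #     'Age': 72, 'Gender': 1, 'Cholesterol': 210, 'Systolic_BP': 140,
    #     'Diastolic_BP': 85, 'Cognitive_Score1': 2, 'Cognitive_Score2': 3,
    #     'Cognitive_Score3': 1, 'Family_History': 1, 'Medical_History': 0,
    #     'Test_Result': 1
    # }])
    # preds, probs = predict_risk(model, scaler, new_individual)
    # print(f"Predicted class: {preds[0]}, probability of Alzheimer's: {probs[0]:.2f}")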
# Execute the program
if __name__ == "__main__":
    main()