import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
try:
    from google.colab import files  # file-upload widget, only available inside Google Colab
except ImportError:
    files = None  # running outside Colab; main() falls back to synthetic data
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
# Step 1: Upload CSV File (if user wants to)
def upload_csv_file():
print("Please upload your CSV file.")
uploaded = files.upload()
# Assuming a single CSV file is uploaded
file_name = next(iter(uploaded)) # Get the name of the uploaded file
df = pd.read_csv(file_name)
# Print the column names to check for mismatches
print("Columns in the uploaded file:")
print(df.columns)
return df
# Step 2: Generate Synthetic Dataset (if no CSV uploaded)
def generate_synthetic_data():
    np.random.seed(42)
    # Synthetic data (100 rows, 12 columns)
    data = {
        'Age': np.random.randint(40, 80, 100),
        'Gender': np.random.choice(['Male', 'Female'], 100),
        'Cholesterol': np.random.randint(150, 250, 100),
        'Systolic_BP': np.random.randint(90, 180, 100),
        'Diastolic_BP': np.random.randint(60, 120, 100),
        'Cognitive_Score1': np.random.randint(1, 4, 100),
        'Cognitive_Score2': np.random.randint(1, 4, 100),
        'Cognitive_Score3': np.random.randint(1, 4, 100),
        'Family_History': np.random.choice([0, 1], 100),   # 0: No, 1: Yes
        'Medical_History': np.random.choice([0, 1], 100),  # 0: No, 1: Yes
        'Test_Result': np.random.choice([0, 1], 100),      # 0: Negative, 1: Positive
        'Alzheimers_Diagnosis': np.random.choice([0, 1], 100)  # 0: No, 1: Yes
    }
    df = pd.DataFrame(data)
    return df
# Step 3: Encode categorical variables
def encode_categorical_columns(df):
    # List of categorical columns (to be encoded into numeric)
    categorical_columns = ['Gender', 'Test_Result', 'Family_History', 'Medical_History']
    label_encoder = LabelEncoder()
    for col in categorical_columns:
        if col in df.columns:
            df[col] = label_encoder.fit_transform(df[col])
    return df
# Step 4: Train a model with the dataset using GradientBoostingClassifier
def train_model(df):
    # Check that the target column exists
    if 'Alzheimers_Diagnosis' not in df.columns:
        raise KeyError("The column 'Alzheimers_Diagnosis' is missing from the dataset.")
    # Encode categorical columns to numeric
    df = encode_categorical_columns(df)
    # Feature selection (exclude 'Alzheimers_Diagnosis', which is the target)
    features = df.drop(columns=['Alzheimers_Diagnosis'])
    target = df['Alzheimers_Diagnosis']
    # Check for missing values
    if features.isnull().sum().any():
        print("Warning: Missing values found in features, filling with median.")
        features = features.fillna(features.median())
    if target.isnull().sum() > 0:
        print("Warning: Missing values found in target, filling with mode.")
        target = target.fillna(target.mode()[0])
    # Scale the data (important for features like 'Age' and blood pressure)
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    # Save the original feature names for plotting feature importance
    feature_names = features.columns  # Original column names before scaling
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
    # Sanity-check that all arrays have consistent shapes
    print(f"Features shape: {features_scaled.shape}, Target shape: {target.shape}")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
    # Initialize GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=3, random_state=42)
    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0]
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    print("Best Hyperparameters found: ", grid_search.best_params_)
    # Use the best model from GridSearchCV (refit on the full training split by default)
    best_model = grid_search.best_estimator_
    # Make predictions
    y_pred = best_model.predict(X_test)
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy * 100:.2f}%")
    # Cross-validation for a more robust estimate of model performance
    cv_scores = cross_val_score(best_model, features_scaled, target, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average Cross-validation accuracy: {np.mean(cv_scores) * 100:.2f}%")
    # Display classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    # Save the trained model
    joblib.dump(best_model, 'alzheimer_model_gbc_best.pkl')
    # Probability of class '1' (Alzheimer's diagnosis) on the test set, used for the ROC/PR curves
    probabilities = best_model.predict_proba(X_test)[:, 1]
    # Ensure that the lengths of X_test and y_test are consistent
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test have inconsistent lengths.")
    return best_model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names
# Step 5: Predict Alzheimer's risk based on new data
def predict_risk(model, scaler, new_data):
    # Scale the new data using the same scaler fitted on the training data
    new_data_scaled = scaler.transform(new_data)
    # Predict Alzheimer's risk for the new data
    predictions = model.predict(new_data_scaled)
    # Predict probabilities of class '1' (Alzheimer's diagnosis) for richer output
    probabilities = model.predict_proba(new_data_scaled)[:, 1]
    return predictions, probabilities
# Step 6: Display predictions and feature importance
def display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df):
    # Ensure consistent lengths
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test must have the same length.")
    # Display predictions and probabilities
    print("\nPredictions (0: No, 1: Yes for Alzheimer's risk):")
    print(y_pred)
    print("\nPredicted probabilities for Alzheimer's (0 to 1 scale):")
    print(probabilities)
    # Display risk insights based on probabilities
    print("\nRisk Insights:")
    for i, prob in enumerate(probabilities):
        if prob < 0.3:
            print(f"Individual {i+1}: Healthy (Probability: {prob:.2f})")
        elif prob < 0.7:
            print(f"Individual {i+1}: At Low Risk (Probability: {prob:.2f})")
        elif prob < 0.9:
            print(f"Individual {i+1}: At High Risk (Probability: {prob:.2f})")
        else:
            print(f"Individual {i+1}: Severe Alzheimer's (Probability: {prob:.2f})")
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No", "Yes"], yticklabels=["No", "Yes"])
    plt.title("Confusion Matrix")
    plt.show()
    # ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, probabilities)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='blue', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()
    # F1 Score
    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1:.2f}")
    # Plot Feature Importance
    feature_importance = model.feature_importances_
    feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    feature_df = feature_df.sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_df)
    plt.title("Feature Importance")
    plt.show()
    # Age Distribution Graph
    plt.figure(figsize=(8, 6))
    sns.histplot(df['Age'], kde=True)
    plt.title("Age Distribution")
    plt.xlabel("Age")
    plt.ylabel("Frequency")
    plt.show()
    # Pie chart for Gender distribution
    gender_dist = df['Gender'].value_counts()
    plt.figure(figsize=(6, 6))
    gender_dist.plot.pie(autopct='%1.1f%%', startangle=90, colors=['#66b3ff', '#ffb3e6'])
    plt.title("Gender Distribution")
    plt.ylabel('')
    plt.show()
    # 3D Scatter plot for Age vs Cognitive Score 1 vs Diagnosis
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df['Age'], df['Cognitive_Score1'], df['Alzheimers_Diagnosis'], c=df['Alzheimers_Diagnosis'], cmap='coolwarm', marker='o')
    ax.set_xlabel('Age')
    ax.set_ylabel('Cognitive Score 1')
    ax.set_zlabel("Alzheimer's Diagnosis")
    ax.set_title('3D Scatter Plot (Age vs Cognitive Score 1 vs Diagnosis)')
    plt.show()
# Run the program
def main():
    try:
        # Option 1: upload your own CSV file; Option 2: generate synthetic data
        print("1. Upload CSV file")
        print("2. Generate Synthetic Data")
        choice = input("Enter choice (1/2): ")
        if choice == '1':
            df = upload_csv_file()
        else:
            df = generate_synthetic_data()
    except Exception as e:
        print(f"Error uploading CSV, falling back to synthetic data: {e}")
        df = generate_synthetic_data()
    # Train the model
    model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names = train_model(df)
    # Display the output
    display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df)
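    # Example (sketch only, not part of the original flow): predict_risk() is defined above
    # but never called here. Uncommenting the lines below shows one way to score a new
    # individual, assuming the synthetic-data schema; the feature values are illustrative
    # and the column names must match whatever dataset the model was trained on.
    # new_individual = pd.DataFrame([{
    #     'Age': 72, 'Gender': 1, 'Cholesterol': 210, 'Systolic_BP': 140,
    #     'Diastolic_BP': 85, 'Cognitive_Score1': 2, 'Cognitive_Score2': 3,
    #     'Cognitive_Score3': 1, 'Family_History': 1, 'Medical_History': 0,
    #     'Test_Result': 1
    # }])
    # preds, probs = predict_risk(model, scaler, new_individual)
    # print(f"Predicted class: {preds[0]}, probability of Alzheimer's: {probs[0]:.2f}")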
# Execute the program
if __name__ == "__main__":
    main()