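"""Alzheimer's risk prediction demo (Google Colab).

Trains a GradientBoostingClassifier on an uploaded CSV or a generated
synthetic dataset, tunes hyperparameters with GridSearchCV, and visualizes
the results: confusion matrix, ROC and precision-recall curves, feature
importance, and basic data distributions.
"""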
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             roc_curve, auc, precision_recall_curve, f1_score)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
from google.colab import files
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection (needed on older matplotlib versions)

# Step 1: Upload CSV File (if user wants to)
def upload_csv_file():
    print("Please upload your CSV file.")
    uploaded = files.upload()

    # Assuming a single CSV file is uploaded
    file_name = next(iter(uploaded))  # Get the name of the uploaded file
    df = pd.read_csv(file_name)

    # Print the column names to check for mismatches
    print("Columns in the uploaded file:")
    print(df.columns)
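    # NOTE: train_model expects an 'Alzheimers_Diagnosis' target column and
    # will raise a KeyError if it is missing.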

    return df

# Step 2: Generate Synthetic Dataset (if no CSV uploaded)
def generate_synthetic_data():
    np.random.seed(42)

    # Synthetic data (100 rows, 12 columns)
    data = {
        'Age': np.random.randint(40, 80, 100),
        'Gender': np.random.choice(['Male', 'Female'], 100),
        'Cholesterol': np.random.randint(150, 250, 100),
        'Systolic_BP': np.random.randint(90, 180, 100),
        'Diastolic_BP': np.random.randint(60, 120, 100),
        'Cognitive_Score1': np.random.randint(1, 4, 100),
        'Cognitive_Score2': np.random.randint(1, 4, 100),
        'Cognitive_Score3': np.random.randint(1, 4, 100),
        'Family_History': np.random.choice([0, 1], 100),  # 0: No, 1: Yes
        'Medical_History': np.random.choice([0, 1], 100),  # 0: No, 1: Yes
        'Test_Result': np.random.choice([0, 1], 100),  # 0: Negative, 1: Positive
        'Alzheimers_Diagnosis': np.random.choice([0, 1], 100)  # 0: No, 1: Yes (labels are random, so expect roughly chance-level accuracy)
    }

    df = pd.DataFrame(data)
    return df

# Step 3: Encode categorical variables
def encode_categorical_columns(df):
    # List of categorical columns (to be encoded into numeric)
    categorical_columns = ['Gender', 'Test_Result', 'Family_History', 'Medical_History']
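    # NOTE: the LabelEncoder below is refit per column and its mappings are not
    # saved, so new data passed to predict_risk must already use the same
    # numeric encoding as the training data.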

    label_encoder = LabelEncoder()

    for col in categorical_columns:
        if col in df.columns:
            df[col] = label_encoder.fit_transform(df[col])

    return df

# Step 4: Train a model with the dataset using GradientBoostingClassifier
def train_model(df):
    # Check if 'Alzheimers_Diagnosis' exists
    if 'Alzheimers_Diagnosis' not in df.columns:
        raise KeyError("The column 'Alzheimers_Diagnosis' is missing from the dataset.")

    # Encode categorical columns to numeric on a copy, so the caller's
    # DataFrame keeps its original labels for the plots in display_output
    encoded = encode_categorical_columns(df.copy())

    # Feature selection (exclude 'Alzheimers_Diagnosis', the target)
    features = encoded.drop(columns=['Alzheimers_Diagnosis'])
    target = encoded['Alzheimers_Diagnosis']

    # Check for missing values
    if features.isnull().sum().any():
        print("Warning: Missing values found in features, filling with median.")
        features = features.fillna(features.median())

    if target.isnull().sum() > 0:
        print("Warning: Missing values found in target, filling with mode.")
        target = target.fillna(target.mode()[0])

    # Scale the features. Tree ensembles such as gradient boosting are largely
    # insensitive to feature scaling, but the fitted scaler is kept so that
    # predict_risk can apply the same transform to new data. Note that fitting
    # on the full dataset before the train/test split leaks test-set statistics;
    # an sklearn Pipeline would avoid this.
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # Save the original feature names for plotting feature importance
    feature_names = features.columns  # Get original column names before scaling

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
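    # (passing stratify=target here would preserve the class balance across the splits)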

    # Sanity-check the shapes of the train/test arrays
    print(f"Features shape: {features_scaled.shape}, Target shape: {target.shape}")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # Initialize GradientBoostingClassifier (these initial hyperparameters are
    # overridden by the grid search below, which tunes all of them)
    model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=3, random_state=42)

    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0]
    }
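    # 3 x 3 x 3 x 3 = 81 parameter combinations x 5 CV folds = 405 fits
    # (plus a final refit), so the search can take a while on larger datasets.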
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    print("Best Hyperparameters found: ", grid_search.best_params_)

    # Use the best model from GridSearchCV (with the default refit=True it is
    # already retrained on the full training set, so no extra fit is needed)
    best_model = grid_search.best_estimator_

    # Make predictions
    y_pred = best_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy * 100:.2f}%")

    # Cross-validation for a more robust performance estimate (the folds overlap
    # the data used for hyperparameter tuning, so this estimate is slightly optimistic)
    cv_scores = cross_val_score(best_model, features_scaled, target, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average Cross-validation accuracy: {np.mean(cv_scores) * 100:.2f}%")

    # Display classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save the trained model
    joblib.dump(best_model, 'alzheimer_model_gbc_best.pkl')
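    # The saved model can be reloaded later with joblib.load('alzheimer_model_gbc_best.pkl')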

    # Probability of class 1 (Alzheimer's diagnosis) for the test set, used for
    # the ROC and precision-recall plots
    probabilities = best_model.predict_proba(X_test)[:, 1]

    # Ensure that the length of X_test and y_test are consistent
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test have inconsistent lengths.")

    return best_model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names

# Step 5: Predict Alzheimer's risk based on new data
def predict_risk(model, scaler, new_data):
    # Scale the new data using the same scaler
    new_data_scaled = scaler.transform(new_data)

    # Predict Alzheimer's risk based on new data
    predictions = model.predict(new_data_scaled)

    # Predicted probability of class 1 (Alzheimer's diagnosis)
    probabilities = model.predict_proba(new_data_scaled)[:, 1]

    return predictions, probabilities

# Step 6: Display predictions and feature importance
def display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df):
    # Ensure consistent lengths
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test must have the same length.")

    # Display predictions and probabilities
    print("\nPredictions (0: No, 1: Yes for Alzheimer's risk):")
    print(y_pred)

    print("\nPredicted probabilities for Alzheimer's (0 to 1 scale):")
    print(probabilities)

    # Display interesting insights based on probabilities
    print("\nRisk Insights:")
    for i, prob in enumerate(probabilities):
        if prob < 0.3:
            print(f"Individual {i+1}: Low Risk (Probability: {prob:.2f})")
        elif prob < 0.7:
            print(f"Individual {i+1}: Moderate Risk (Probability: {prob:.2f})")
        elif prob < 0.9:
            print(f"Individual {i+1}: High Risk (Probability: {prob:.2f})")
        else:
            # The original branches ended at 0.95, silently skipping higher probabilities
            print(f"Individual {i+1}: Very High Risk (Probability: {prob:.2f})")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No", "Yes"], yticklabels=["No", "Yes"])
    plt.title("Confusion Matrix")
    plt.show()

    # ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, probabilities)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='blue', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()

    # F1 Score
    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1:.2f}")

    # Plot Feature Importance
    feature_importance = model.feature_importances_
    feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    feature_df = feature_df.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(8, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_df)
    plt.title("Feature Importance")
    plt.show()

    # Age Distribution Graph
    plt.figure(figsize=(8, 6))
    sns.histplot(df['Age'], kde=True)
    plt.title("Age Distribution")
    plt.xlabel("Age")
    plt.ylabel("Frequency")
    plt.show()

    # Pie chart for Gender distribution
    gender_dist = df['Gender'].value_counts()
    plt.figure(figsize=(6, 6))
    gender_dist.plot.pie(autopct='%1.1f%%', startangle=90, colors=['#66b3ff','#ffb3e6'])
    plt.title("Gender Distribution")
    plt.ylabel('')
    plt.show()

    # 3D scatter plot: Age vs Cognitive Score 1 vs Diagnosis
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df['Age'], df['Cognitive_Score1'], df['Alzheimers_Diagnosis'],
               c=df['Alzheimers_Diagnosis'], cmap='coolwarm', marker='o')
    ax.set_xlabel('Age')
    ax.set_ylabel('Cognitive Score 1')
    ax.set_zlabel("Alzheimer's Diagnosis")
    ax.set_title('3D Scatter Plot (Age vs Cognitive Score 1 vs Diagnosis)')
    plt.show()

# Run the program
def main():
    try:
        # Choose a data source: upload a CSV or generate synthetic data
        print("1. Upload CSV file")
        print("2. Generate Synthetic Data")
        choice = input("Enter choice (1/2): ")
        if choice == '1':
            df = upload_csv_file()
        else:
            df = generate_synthetic_data()
    except Exception as e:
        print(f"Error uploading CSV, falling back to synthetic data: {e}")
        df = generate_synthetic_data()

    # Train the model
    model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names = train_model(df)

    # Display the output
    display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df)
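    # Example: score a new individual with predict_risk. This is a minimal
    # illustration assuming the synthetic-data schema and its LabelEncoder
    # mappings (e.g. Gender: Female=0, Male=1); adapt the column names and
    # values if you trained on your own CSV. The column order must match the
    # data the scaler was fitted on.
    sample = pd.DataFrame([{
        'Age': 72, 'Gender': 1, 'Cholesterol': 210,
        'Systolic_BP': 140, 'Diastolic_BP': 85,
        'Cognitive_Score1': 2, 'Cognitive_Score2': 3, 'Cognitive_Score3': 1,
        'Family_History': 1, 'Medical_History': 0, 'Test_Result': 1
    }])
    prediction, probability = predict_risk(model, scaler, sample)
    print(f"\nExample individual -> prediction: {prediction[0]}, probability: {probability[0]:.2f}")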

# Execute the program
if __name__ == "__main__":
    main()