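"""Alzheimer's risk prediction demo (Google Colab).

Trains a GradientBoostingClassifier on an uploaded CSV or a generated
synthetic dataset, tunes hyperparameters with GridSearchCV, and visualizes
the results: confusion matrix, ROC and precision-recall curves, feature
importance, and basic data distributions.
"""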
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             roc_curve, auc, precision_recall_curve, f1_score)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
from google.colab import files
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection (needed on older matplotlib versions)

# Step 1: Upload CSV File (if user wants to)
def upload_csv_file():
    print("Please upload your CSV file.")
    uploaded = files.upload()

    # Assuming a single CSV file is uploaded
    file_name = next(iter(uploaded))  # Get the name of the uploaded file
    df = pd.read_csv(file_name)

    # Print the column names to check for mismatches
    print("Columns in the uploaded file:")
    print(df.columns)
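    # NOTE: train_model expects an 'Alzheimers_Diagnosis' target column and
    # will raise a KeyError if it is missing.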

    return df

# Step 2: Generate Synthetic Dataset (if no CSV uploaded)
def generate_synthetic_data():
    np.random.seed(42)

    # Synthetic data (100 rows, 12 columns)
    data = {
        'Age': np.random.randint(40, 80, 100),
        'Gender': np.random.choice(['Male', 'Female'], 100),
        'Cholesterol': np.random.randint(150, 250, 100),
        'Systolic_BP': np.random.randint(90, 180, 100),
        'Diastolic_BP': np.random.randint(60, 120, 100),
        'Cognitive_Score1': np.random.randint(1, 4, 100),
        'Cognitive_Score2': np.random.randint(1, 4, 100),
        'Cognitive_Score3': np.random.randint(1, 4, 100),
        'Family_History': np.random.choice([0, 1], 100),  # 0: No, 1: Yes
        'Medical_History': np.random.choice([0, 1], 100),  # 0: No, 1: Yes
        'Test_Result': np.random.choice([0, 1], 100),  # 0: Negative, 1: Positive
        'Alzheimers_Diagnosis': np.random.choice([0, 1], 100)  # 0: No, 1: Yes (labels are random, so expect roughly chance-level accuracy)
    }

    df = pd.DataFrame(data)
    return df

# Step 3: Encode categorical variables
def encode_categorical_columns(df):
    # List of categorical columns (to be encoded into numeric)
    categorical_columns = ['Gender', 'Test_Result', 'Family_History', 'Medical_History']
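    # NOTE: the LabelEncoder below is refit per column and its mappings are not
    # saved, so new data passed to predict_risk must already use the same
    # numeric encoding as the training data.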

    label_encoder = LabelEncoder()

    for col in categorical_columns:
        if col in df.columns:
            df[col] = label_encoder.fit_transform(df[col])

    return df

# Step 4: Train a model with the dataset using GradientBoostingClassifier
def train_model(df):
    # Check if 'Alzheimers_Diagnosis' exists
    if 'Alzheimers_Diagnosis' not in df.columns:
        raise KeyError("The column 'Alzheimers_Diagnosis' is missing from the dataset.")

    # Encode categorical columns to numeric on a copy, so the caller's
    # DataFrame keeps its original labels for the plots in display_output
    encoded = encode_categorical_columns(df.copy())

    # Feature selection (exclude 'Alzheimers_Diagnosis', the target)
    features = encoded.drop(columns=['Alzheimers_Diagnosis'])
    target = encoded['Alzheimers_Diagnosis']

    # Check for missing values
    if features.isnull().sum().any():
        print("Warning: Missing values found in features, filling with median.")
        features = features.fillna(features.median())

    if target.isnull().sum() > 0:
        print("Warning: Missing values found in target, filling with mode.")
        target = target.fillna(target.mode()[0])

    # Scale the features. Tree ensembles such as gradient boosting are largely
    # insensitive to feature scaling, but the fitted scaler is kept so that
    # predict_risk can apply the same transform to new data. Note that fitting
    # on the full dataset before the train/test split leaks test-set statistics;
    # an sklearn Pipeline would avoid this.
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # Save the original feature names for plotting feature importance
    feature_names = features.columns  # Get original column names before scaling

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
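    # (passing stratify=target here would preserve the class balance across the splits)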

    # Sanity-check the shapes of the train/test arrays
    print(f"Features shape: {features_scaled.shape}, Target shape: {target.shape}")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # Initialize GradientBoostingClassifier (these initial hyperparameters are
    # overridden by the grid search below, which tunes all of them)
    model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=3, random_state=42)

    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0]
    }
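    # 3 x 3 x 3 x 3 = 81 parameter combinations x 5 CV folds = 405 fits
    # (plus a final refit), so the search can take a while on larger datasets.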
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    print("Best Hyperparameters found: ", grid_search.best_params_)

    # Use the best model from GridSearchCV (with the default refit=True it is
    # already retrained on the full training set, so no extra fit is needed)
    best_model = grid_search.best_estimator_

    # Make predictions
    y_pred = best_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy * 100:.2f}%")

    # Cross-validation for a more robust performance estimate (the folds overlap
    # the data used for hyperparameter tuning, so this estimate is slightly optimistic)
    cv_scores = cross_val_score(best_model, features_scaled, target, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average Cross-validation accuracy: {np.mean(cv_scores) * 100:.2f}%")

    # Display classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save the trained model
    joblib.dump(best_model, 'alzheimer_model_gbc_best.pkl')
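    # The saved model can be reloaded later with joblib.load('alzheimer_model_gbc_best.pkl')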

    # Probability of class 1 (Alzheimer's diagnosis) for the test set, used for
    # the ROC and precision-recall plots
    probabilities = best_model.predict_proba(X_test)[:, 1]

    # Ensure that the length of X_test and y_test are consistent
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test have inconsistent lengths.")

    return best_model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names

# Step 5: Predict Alzheimer's risk based on new data
def predict_risk(model, scaler, new_data):
    # Scale the new data using the same scaler
    new_data_scaled = scaler.transform(new_data)

    # Predict Alzheimer's risk based on new data
    predictions = model.predict(new_data_scaled)

    # Predicted probability of class 1 (Alzheimer's diagnosis)
    probabilities = model.predict_proba(new_data_scaled)[:, 1]

    return predictions, probabilities

# Step 6: Display predictions and feature importance
def display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df):
    # Ensure consistent lengths
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test must have the same length.")

    # Display predictions and probabilities
    print("\nPredictions (0: No, 1: Yes for Alzheimer's risk):")
    print(y_pred)

    print("\nPredicted probabilities for Alzheimer's (0 to 1 scale):")
    print(probabilities)

    # Display interesting insights based on probabilities
    print("\nRisk Insights:")
    for i, prob in enumerate(probabilities):
        if prob < 0.3:
            print(f"Individual {i+1}: Low Risk (Probability: {prob:.2f})")
        elif prob < 0.7:
            print(f"Individual {i+1}: Moderate Risk (Probability: {prob:.2f})")
        elif prob < 0.9:
            print(f"Individual {i+1}: High Risk (Probability: {prob:.2f})")
        else:
            # The original branches ended at 0.95, silently skipping higher probabilities
            print(f"Individual {i+1}: Very High Risk (Probability: {prob:.2f})")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No", "Yes"], yticklabels=["No", "Yes"])
    plt.title("Confusion Matrix")
    plt.show()

    # ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, probabilities)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='blue', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()

    # F1 Score
    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1:.2f}")

    # Plot Feature Importance
    feature_importance = model.feature_importances_
    feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    feature_df = feature_df.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(8, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_df)
    plt.title("Feature Importance")
    plt.show()

    # Age Distribution Graph
    plt.figure(figsize=(8, 6))
    sns.histplot(df['Age'], kde=True)
    plt.title("Age Distribution")
    plt.xlabel("Age")
    plt.ylabel("Frequency")
    plt.show()

    # Pie chart for Gender distribution
    gender_dist = df['Gender'].value_counts()
    plt.figure(figsize=(6, 6))
    gender_dist.plot.pie(autopct='%1.1f%%', startangle=90, colors=['#66b3ff','#ffb3e6'])
    plt.title("Gender Distribution")
    plt.ylabel('')
    plt.show()

    # 3D scatter plot: Age vs Cognitive Score 1 vs Diagnosis
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df['Age'], df['Cognitive_Score1'], df['Alzheimers_Diagnosis'],
               c=df['Alzheimers_Diagnosis'], cmap='coolwarm', marker='o')
    ax.set_xlabel('Age')
    ax.set_ylabel('Cognitive Score 1')
    ax.set_zlabel("Alzheimer's Diagnosis")
    ax.set_title('3D Scatter Plot (Age vs Cognitive Score 1 vs Diagnosis)')
    plt.show()

# Run the program
def main():
    try:
        # Choose a data source: upload a CSV or generate synthetic data
        print("1. Upload CSV file")
        print("2. Generate Synthetic Data")
        choice = input("Enter choice (1/2): ")
        if choice == '1':
            df = upload_csv_file()
        else:
            df = generate_synthetic_data()
    except Exception as e:
        print(f"Error uploading CSV, falling back to synthetic data: {e}")
        df = generate_synthetic_data()

    # Train the model
    model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names = train_model(df)

    # Display the output
    display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df)
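    # Example: score a new individual with predict_risk. This is a minimal
    # illustration assuming the synthetic-data schema and its LabelEncoder
    # mappings (e.g. Gender: Female=0, Male=1); adapt the column names and
    # values if you trained on your own CSV. The column order must match the
    # data the scaler was fitted on.
    sample = pd.DataFrame([{
        'Age': 72, 'Gender': 1, 'Cholesterol': 210,
        'Systolic_BP': 140, 'Diastolic_BP': 85,
        'Cognitive_Score1': 2, 'Cognitive_Score2': 3, 'Cognitive_Score3': 1,
        'Family_History': 1, 'Medical_History': 0, 'Test_Result': 1
    }])
    prediction, probability = predict_risk(model, scaler, sample)
    print(f"\nExample individual -> prediction: {prediction[0]}, probability: {probability[0]:.2f}")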

# Execute the program
if __name__ == "__main__":
    main()