import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
from google.colab import files
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
# Step 1: Upload CSV File (if the user wants to)
def upload_csv_file():
    print("Please upload your CSV file.")
    uploaded = files.upload()
    # Assuming a single CSV file is uploaded
    file_name = next(iter(uploaded))  # Get the name of the uploaded file
    df = pd.read_csv(file_name)
    # Print the column names to check for mismatches
    print("Columns in the uploaded file:")
    print(df.columns)
    return df
# Step 2: Generate Synthetic Dataset (if no CSV uploaded)
def generate_synthetic_data():
    np.random.seed(42)
    # Synthetic data (100 rows, 12 columns)
    data = {
        'Age': np.random.randint(40, 80, 100),
        'Gender': np.random.choice(['Male', 'Female'], 100),
        'Cholesterol': np.random.randint(150, 250, 100),
        'Systolic_BP': np.random.randint(90, 180, 100),
        'Diastolic_BP': np.random.randint(60, 120, 100),
        'Cognitive_Score1': np.random.randint(1, 4, 100),
        'Cognitive_Score2': np.random.randint(1, 4, 100),
        'Cognitive_Score3': np.random.randint(1, 4, 100),
        'Family_History': np.random.choice([0, 1], 100),   # 0: No, 1: Yes
        'Medical_History': np.random.choice([0, 1], 100),  # 0: No, 1: Yes
        'Test_Result': np.random.choice([0, 1], 100),      # 0: Negative, 1: Positive
        'Alzheimers_Diagnosis': np.random.choice([0, 1], 100)  # 0: No, 1: Yes
    }
    df = pd.DataFrame(data)
    return df
# Step 3: Encode categorical variables
def encode_categorical_columns(df):
    # List of categorical columns (to be encoded into numeric)
    categorical_columns = ['Gender', 'Test_Result', 'Family_History', 'Medical_History']
    label_encoder = LabelEncoder()
    for col in categorical_columns:
        if col in df.columns:
            df[col] = label_encoder.fit_transform(df[col])
    return df
# Step 4: Train a model on the dataset using GradientBoostingClassifier
def train_model(df):
    # Check that the target column exists
    if 'Alzheimers_Diagnosis' not in df.columns:
        raise KeyError("The column 'Alzheimers_Diagnosis' is missing from the dataset.")
    # Encode categorical columns so all data passed to the scaler is numeric
    df = encode_categorical_columns(df)
    # Feature selection (exclude 'Alzheimers_Diagnosis', which is the target)
    features = df.drop(columns=['Alzheimers_Diagnosis'])
    target = df['Alzheimers_Diagnosis']
    # Check for missing values
    if features.isnull().sum().any():
        print("Warning: Missing values found in features, filling with median.")
        features = features.fillna(features.median())
    if target.isnull().sum() > 0:
        print("Warning: Missing values found in target, filling with mode.")
        target = target.fillna(target.mode()[0])
    # Scale the data (important for features like 'Age' and blood pressure)
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    # Save the original feature names for plotting feature importance
    feature_names = features.columns  # Original column names before scaling
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
    # Sanity-check that all arrays have consistent shapes
    print(f"Features shape: {features_scaled.shape}, Target shape: {target.shape}")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
    # Initialize GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=3, random_state=42)
    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0]
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    print("Best Hyperparameters found: ", grid_search.best_params_)
    # Use the best model from GridSearchCV (already refit on the full training split)
    best_model = grid_search.best_estimator_
    # Make predictions
    y_pred = best_model.predict(X_test)
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy * 100:.2f}%")
    # Cross-validation for a better estimate of model performance
    cv_scores = cross_val_score(best_model, features_scaled, target, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average Cross-validation accuracy: {np.mean(cv_scores) * 100:.2f}%")
    # Display classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    # Save the trained model
    joblib.dump(best_model, 'alzheimer_model_gbc_best.pkl')
    # Probability of class '1' (Alzheimer's diagnosis) for the test set
    probabilities = best_model.predict_proba(X_test)[:, 1]
    # Ensure that X_test and y_test have consistent lengths before returning
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test have inconsistent lengths.")
    # Return the trained model and test data for prediction and visualization
    return best_model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names
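# Note (possible extension, not part of the original flow): train_model persists only the
# classifier. If predict_risk is needed in a later session, the fitted scaler could be
# saved alongside it, e.g. with a hypothetical file name:
# joblib.dump(scaler, 'alzheimer_scaler.pkl')  # later: scaler = joblib.load('alzheimer_scaler.pkl')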
# Step 5: Predict Alzheimer's risk for new data
def predict_risk(model, scaler, new_data):
    # Scale the new data using the same scaler fitted on the training data
    new_data_scaled = scaler.transform(new_data)
    # Predict Alzheimer's risk (0/1) for the new data
    predictions = model.predict(new_data_scaled)
    # Predicted probability of class '1' (Alzheimer's diagnosis)
    probabilities = model.predict_proba(new_data_scaled)[:, 1]
    return predictions, probabilities
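# Illustrative usage sketch (not called by main): shows how predict_risk could be applied
# to a single new record. The values below are hypothetical and assume the synthetic schema
# above, with 'Gender' label-encoded (e.g. Male=1) and columns in the original training order.
def example_predict_single_record(model, scaler, feature_names):
    # One hypothetical individual: Age, Gender, Cholesterol, Systolic_BP, Diastolic_BP,
    # Cognitive_Score1-3, Family_History, Medical_History, Test_Result
    new_data = pd.DataFrame([[72, 1, 210, 140, 85, 2, 1, 3, 1, 0, 1]], columns=feature_names)
    predictions, probabilities = predict_risk(model, scaler, new_data)
    print(f"Predicted class: {predictions[0]}, probability of Alzheimer's: {probabilities[0]:.2f}")
    return predictions, probabilities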
# Step 6: Display predictions, metrics, and feature importance
def display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df):
    # Ensure consistent lengths
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test must have the same length.")
    # Display predictions and probabilities
    print("\nPredictions (0: No, 1: Yes for Alzheimer's risk):")
    print(y_pred)
    print("\nPredicted probabilities for Alzheimer's (0 to 1 scale):")
    print(probabilities)
    # Display risk insights based on probabilities
    print("\nRisk Insights:")
    for i, prob in enumerate(probabilities):
        if prob < 0.3:
            print(f"Individual {i+1}: Healthy (Probability: {prob:.2f})")
        elif prob < 0.7:
            print(f"Individual {i+1}: At Low Risk (Probability: {prob:.2f})")
        elif prob < 0.9:
            print(f"Individual {i+1}: At High Risk (Probability: {prob:.2f})")
        else:
            print(f"Individual {i+1}: Severe Alzheimer's Risk (Probability: {prob:.2f})")
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No", "Yes"], yticklabels=["No", "Yes"])
    plt.title("Confusion Matrix")
    plt.show()
    # ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, probabilities)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='blue', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()
    # F1 Score
    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1:.2f}")
    # Plot Feature Importance
    feature_importance = model.feature_importances_
    feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    feature_df = feature_df.sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_df)
    plt.title("Feature Importance")
    plt.show()
    # Age Distribution Graph
    plt.figure(figsize=(8, 6))
    sns.histplot(df['Age'], kde=True)
    plt.title("Age Distribution")
    plt.xlabel("Age")
    plt.ylabel("Frequency")
    plt.show()
    # Pie chart for Gender distribution
    gender_dist = df['Gender'].value_counts()
    plt.figure(figsize=(6, 6))
    gender_dist.plot.pie(autopct='%1.1f%%', startangle=90, colors=['#66b3ff', '#ffb3e6'])
    plt.title("Gender Distribution")
    plt.ylabel('')
    plt.show()
    # 3D Scatter plot for Age vs Cognitive Score 1 vs Diagnosis
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df['Age'], df['Cognitive_Score1'], df['Alzheimers_Diagnosis'], c=df['Alzheimers_Diagnosis'], cmap='coolwarm', marker='o')
    ax.set_xlabel('Age')
    ax.set_ylabel('Cognitive Score 1')
    ax.set_zlabel("Alzheimer's Diagnosis")
    ax.set_title('3D Scatter Plot (Age vs Cognitive Score 1 vs Diagnosis)')
    plt.show()
# Run the program
def main():
    try:
        # Option 1: upload your own CSV file; Option 2: generate synthetic data
        print("1. Upload CSV file")
        print("2. Generate Synthetic Data")
        choice = input("Enter choice (1/2): ")
        if choice == '1':
            df = upload_csv_file()
        else:
            df = generate_synthetic_data()
    except Exception as e:
        print(f"Error uploading CSV, falling back to synthetic data: {e}")
        df = generate_synthetic_data()
    # Train the model
    model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names = train_model(df)
    # Display the output
    display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df)
# Execute the program
if __name__ == "__main__":
    main()
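# Optional follow-up sketch: the model saved by train_model can be reloaded later without
# retraining. This assumes 'alzheimer_model_gbc_best.pkl' exists from a previous run; note
# that new inputs must still be scaled the same way as the training data.
# loaded_model = joblib.load('alzheimer_model_gbc_best.pkl')
# print(loaded_model.get_params())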