import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             roc_curve, auc, precision_recall_curve, f1_score)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
from google.colab import files
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D  # imported for the '3d' projection; not referenced directly

# Step 1: Upload CSV File (if user wants to)
def upload_csv_file():
    print("Please upload your CSV file.")
    uploaded = files.upload()
    # Assuming a single CSV file is uploaded
    file_name = next(iter(uploaded))  # Get the name of the uploaded file
    df = pd.read_csv(file_name)
    # Print the column names to check for mismatches
    print("Columns in the uploaded file:")
    print(df.columns)
    return df

# Step 2: Generate Synthetic Dataset (if no CSV uploaded)
def generate_synthetic_data():
    np.random.seed(42)
    # Synthetic data: 100 rows, 11 feature columns plus a binary target
    data = {
        'Age': np.random.randint(40, 80, 100),
        'Gender': np.random.choice(['Male', 'Female'], 100),
        'Cholesterol': np.random.randint(150, 250, 100),
        'Systolic_BP': np.random.randint(90, 180, 100),
        'Diastolic_BP': np.random.randint(60, 120, 100),
        'Cognitive_Score1': np.random.randint(1, 4, 100),
        'Cognitive_Score2': np.random.randint(1, 4, 100),
        'Cognitive_Score3': np.random.randint(1, 4, 100),
        'Family_History': np.random.choice([0, 1], 100),       # 0: No, 1: Yes
        'Medical_History': np.random.choice([0, 1], 100),      # 0: No, 1: Yes
        'Test_Result': np.random.choice([0, 1], 100),          # 0: Negative, 1: Positive
        'Alzheimers_Diagnosis': np.random.choice([0, 1], 100)  # 0: No, 1: Yes
    }
    df = pd.DataFrame(data)
    return df

# Step 3: Encode categorical variables
def encode_categorical_columns(df):
    # Categorical columns to be encoded into numeric labels
    categorical_columns = ['Gender', 'Test_Result', 'Family_History', 'Medical_History']
    label_encoder = LabelEncoder()
    for col in categorical_columns:
        if col in df.columns:
            df[col] = label_encoder.fit_transform(df[col])
    return df
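# Optional sketch (not called anywhere): the loop above refits one LabelEncoder per column
# and then discards it, so the learned category-to-integer mapping cannot be reapplied to
# new data at prediction time. One common alternative is to keep a fitted encoder per
# column. `encode_with_saved_encoders` is a hypothetical helper illustrating this idea;
# it is not part of the original pipeline.
def encode_with_saved_encoders(df, categorical_columns=('Gender', 'Test_Result',
                                                        'Family_History', 'Medical_History')):
    encoders = {}
    for col in categorical_columns:
        if col in df.columns:
            enc = LabelEncoder()
            df[col] = enc.fit_transform(df[col])
            encoders[col] = enc  # keep the fitted encoder so .transform() can be reused later
    return df, encoders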
# Step 4: Train a model with the dataset using GradientBoostingClassifier
def train_model(df):
    # Check if 'Alzheimers_Diagnosis' exists
    if 'Alzheimers_Diagnosis' not in df.columns:
        raise KeyError("The column 'Alzheimers_Diagnosis' is missing from the dataset.")

    # Encode categorical columns so everything passed to the scaler is numeric
    df = encode_categorical_columns(df)

    # Feature selection (exclude 'Alzheimers_Diagnosis', which is the target)
    features = df.drop(columns=['Alzheimers_Diagnosis'])
    target = df['Alzheimers_Diagnosis']

    # Check for missing values
    if features.isnull().sum().any():
        print("Warning: Missing values found in features, filling with median.")
        features = features.fillna(features.median())
    if target.isnull().sum() > 0:
        print("Warning: Missing values found in target, filling with mode.")
        target = target.fillna(target.mode()[0])

    # Scale the data (important for features like 'Age' and blood pressure)
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # Save the original feature names for plotting feature importance later
    feature_names = features.columns

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features_scaled, target, test_size=0.2, random_state=42)

    # Sanity-check that all arrays have consistent shapes
    print(f"Features shape: {features_scaled.shape}, Target shape: {target.shape}")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # Initialize GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05,
                                       max_depth=3, random_state=42)

    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0]
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                               n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    print("Best Hyperparameters found: ", grid_search.best_params_)

    # Use the best model from GridSearchCV (refit=True, the default, has already
    # retrained it on the full training split, so no extra fit is needed)
    best_model = grid_search.best_estimator_

    # Make predictions
    y_pred = best_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy * 100:.2f}%")

    # Cross-validation to get a better estimate of model performance
    cv_scores = cross_val_score(best_model, features_scaled, target, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average Cross-validation accuracy: {np.mean(cv_scores) * 100:.2f}%")

    # Display classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save the trained model
    joblib.dump(best_model, 'alzheimer_model_gbc_best.pkl')

    # Probability of class '1' (Alzheimer's diagnosis) for the test set, used for the plots
    probabilities = best_model.predict_proba(X_test)[:, 1]

    # Ensure that the lengths of X_test and y_test are consistent
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test have inconsistent lengths.")

    return (best_model, scaler, X_train, X_test, y_train, y_test,
            y_pred, accuracy, probabilities, feature_names)

# Step 5: Predict Alzheimer's risk based on new data
def predict_risk(model, scaler, new_data):
    # Scale the new data using the same scaler fitted during training
    new_data_scaled = scaler.transform(new_data)
    # Predict the class and the probability of class '1' (Alzheimer's diagnosis)
    predictions = model.predict(new_data_scaled)
    probabilities = model.predict_proba(new_data_scaled)[:, 1]
    return predictions, probabilities
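# Optional usage sketch (not called anywhere): how predict_risk might be used on a new
# record. The values below are made up, and the helper assumes the new row uses the same
# columns and 0/1 encodings the model was trained on -- it mirrors the synthetic schema
# above, not a validated clinical input format.
def example_predict_new_patient(model, scaler, feature_names):
    new_patient = pd.DataFrame([{
        'Age': 72, 'Gender': 1, 'Cholesterol': 210,
        'Systolic_BP': 145, 'Diastolic_BP': 85,
        'Cognitive_Score1': 2, 'Cognitive_Score2': 1, 'Cognitive_Score3': 3,
        'Family_History': 1, 'Medical_History': 0, 'Test_Result': 1
    }])[list(feature_names)]  # reorder columns to match the training features
    prediction, probability = predict_risk(model, scaler, new_patient)
    print(f"Predicted class: {prediction[0]}, probability of Alzheimer's: {probability[0]:.2f}")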
# Step 6: Display predictions and feature importance
def display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df):
    # Ensure consistent lengths
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test must have the same length.")

    # Display predictions and probabilities
    print("\nPredictions (0: No, 1: Yes for Alzheimer's risk):")
    print(y_pred)
    print("\nPredicted probabilities for Alzheimer's (0 to 1 scale):")
    print(probabilities)

    # Display risk insights based on probabilities
    print("\nRisk Insights:")
    for i, prob in enumerate(probabilities):
        if prob < 0.3:
            print(f"Individual {i+1}: Healthy (Probability: {prob:.2f})")
        elif prob < 0.7:
            print(f"Individual {i+1}: At Low Risk (Probability: {prob:.2f})")
        elif prob < 0.9:
            print(f"Individual {i+1}: At High Risk (Probability: {prob:.2f})")
        else:  # prob >= 0.9, so every individual receives a label
            print(f"Individual {i+1}: Severe Alzheimer's (Probability: {prob:.2f})")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["No", "Yes"], yticklabels=["No", "Yes"])
    plt.title("Confusion Matrix")
    plt.show()

    # ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, probabilities)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='blue', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()

    # F1 Score
    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1:.2f}")

    # Plot Feature Importance
    feature_importance = model.feature_importances_
    feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    feature_df = feature_df.sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_df)
    plt.title("Feature Importance")
    plt.show()

    # Age Distribution Graph
    plt.figure(figsize=(8, 6))
    sns.histplot(df['Age'], kde=True)
    plt.title("Age Distribution")
    plt.xlabel("Age")
    plt.ylabel("Frequency")
    plt.show()

    # Pie chart for Gender distribution (note: 'Gender' holds encoded integer labels here,
    # because encode_categorical_columns modifies the DataFrame in place during training)
    gender_dist = df['Gender'].value_counts()
    plt.figure(figsize=(6, 6))
    gender_dist.plot.pie(autopct='%1.1f%%', startangle=90, colors=['#66b3ff', '#ffb3e6'])
    plt.title("Gender Distribution")
    plt.ylabel('')
    plt.show()

    # 3D Scatter plot for Age vs Cognitive Scores vs Diagnosis
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df['Age'], df['Cognitive_Score1'], df['Alzheimers_Diagnosis'],
               c=df['Alzheimers_Diagnosis'], cmap='coolwarm', marker='o')
    ax.set_xlabel('Age')
    ax.set_ylabel('Cognitive Score 1')
    ax.set_zlabel("Alzheimer's Diagnosis")
    ax.set_title('3D Scatter Plot (Age vs Cognitive Scores vs Diagnosis)')
    plt.show()

# Run the program
def main():
    try:
        # Option 1: upload your own CSV file; Option 2: generate synthetic data
        print("1. Upload CSV file")
        print("2. Generate Synthetic Data")
        choice = input("Enter choice (1/2): ")
        if choice == '1':
            df = upload_csv_file()
        else:
            df = generate_synthetic_data()
    except Exception as e:
        print(f"Error uploading CSV, falling back to synthetic data: {e}")
        df = generate_synthetic_data()

    # Train the model
    (model, scaler, X_train, X_test, y_train, y_test,
     y_pred, accuracy, probabilities, feature_names) = train_model(df)

    # Display the output
    display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df)

# Execute the program
if __name__ == "__main__":
    main()
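# Optional sketch (not called anywhere): reloading the persisted model for later inference.
# train_model saves only the classifier ('alzheimer_model_gbc_best.pkl'); the fitted
# StandardScaler is not written to disk, so standalone scoring in a fresh session would
# also need the scaler persisted. The 'alzheimer_scaler.pkl' name below is a hypothetical
# choice, not a file the script currently produces.
def save_and_reload_example(scaler, new_data):
    joblib.dump(scaler, 'alzheimer_scaler.pkl')                 # persist the scaler alongside the model
    loaded_model = joblib.load('alzheimer_model_gbc_best.pkl')  # reload the tuned classifier
    loaded_scaler = joblib.load('alzheimer_scaler.pkl')
    return predict_risk(loaded_model, loaded_scaler, new_data)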