import pandas as pd |
import numpy as np |
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV |
from sklearn.ensemble import GradientBoostingClassifier |
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, f1_score |
from sklearn.preprocessing import StandardScaler, LabelEncoder |
import joblib |
from google.colab import files |
import matplotlib.pyplot as plt |
import seaborn as sns |
from mpl_toolkits.mplot3d import Axes3D |
def upload_csv_file():
    """Prompt for a CSV upload in Google Colab and load it into a DataFrame.

    Opens the Colab upload widget through ``files.upload()``, takes the first
    key of the returned mapping as the file name, reads that file with
    ``pd.read_csv`` and prints the resulting column index.

    Returns:
        pandas.DataFrame: Contents of the first uploaded file.

    Raises:
        ValueError: If the upload widget returned no files.
    """
    print("Please upload your CSV file.")
    uploaded = files.upload()
    if not uploaded:
        # Without this guard an empty result surfaces as a bare StopIteration
        # (no message) from next(iter(...)), which is useless to the caller.
        raise ValueError("No file was uploaded.")
    # Only the first uploaded file is used; any others are ignored.
    file_name = next(iter(uploaded))
    # NOTE(review): reads by name from the working directory -- assumes Colab
    # stored the upload there under this name; confirm if run elsewhere.
    df = pd.read_csv(file_name)
    print("Columns in the uploaded file:")
    print(df.columns)
    return df
def generate_synthetic_data(n_samples=100, seed=42):
    """Build a reproducible random dataset with the columns the pipeline expects.

    Every column is drawn independently, so the features carry no real signal
    about ``Alzheimers_Diagnosis``; the data is only useful for exercising the
    pipeline end to end.

    Args:
        n_samples: Number of rows to generate (default 100).
        seed: Seed for the private random generator (default 42).

    Returns:
        pandas.DataFrame: ``n_samples`` rows with the columns Age, Gender,
        Cholesterol, Systolic_BP, Diastolic_BP, Cognitive_Score1..3,
        Family_History, Medical_History, Test_Result and Alzheimers_Diagnosis.
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by the module-level functions, but leaves the process-wide
    # NumPy RNG untouched for the rest of the program.
    rng = np.random.RandomState(seed)
    # Column order matters: each draw advances the generator, so reordering
    # the entries would change the generated values.
    data = {
        'Age': rng.randint(40, 80, n_samples),
        'Gender': rng.choice(['Male', 'Female'], n_samples),
        'Cholesterol': rng.randint(150, 250, n_samples),
        'Systolic_BP': rng.randint(90, 180, n_samples),
        'Diastolic_BP': rng.randint(60, 120, n_samples),
        'Cognitive_Score1': rng.randint(1, 4, n_samples),
        'Cognitive_Score2': rng.randint(1, 4, n_samples),
        'Cognitive_Score3': rng.randint(1, 4, n_samples),
        'Family_History': rng.choice([0, 1], n_samples),
        'Medical_History': rng.choice([0, 1], n_samples),
        'Test_Result': rng.choice([0, 1], n_samples),
        'Alzheimers_Diagnosis': rng.choice([0, 1], n_samples),
    }
    return pd.DataFrame(data)
def encode_categorical_columns(df):
    """Return a copy of ``df`` with the known categorical columns label-encoded.

    The columns Gender, Test_Result, Family_History and Medical_History are
    each replaced by the integer codes produced by ``LabelEncoder``; columns
    from that list that are absent from ``df`` are skipped.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        pandas.DataFrame: A copy of ``df`` with the present categorical
        columns replaced by their encoded values.
    """
    categorical_columns = ['Gender', 'Test_Result', 'Family_History', 'Medical_History']
    # Work on a copy so the caller keeps the original values (e.g. the raw
    # Gender labels are still needed for plotting after training).
    encoded = df.copy()
    for col in categorical_columns:
        if col in encoded.columns:
            # One encoder per column: each column has its own class set, so
            # sharing a single refitted instance gains nothing.
            encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded
def train_model(df):
    """Tune, train and evaluate a gradient-boosting classifier on ``df``.

    Steps: encode categorical columns, impute missing values (feature medians,
    target mode), standardise features, hold out 20% as a test set, grid-search
    hyperparameters with 5-fold CV on the training part, report test accuracy,
    5-fold CV accuracy and a classification report, and save the tuned model to
    ``alzheimer_model_gbc_best.pkl``.

    Args:
        df: DataFrame containing an ``Alzheimers_Diagnosis`` target column plus
            feature columns. It is not modified.

    Returns:
        tuple: ``(best_model, scaler, X_train, X_test, y_train, y_test, y_pred,
        accuracy, probabilities, feature_names)`` where ``probabilities`` holds
        the second ``predict_proba`` column for ``X_test``.

    Raises:
        KeyError: If ``Alzheimers_Diagnosis`` is missing from ``df``.
    """
    if 'Alzheimers_Diagnosis' not in df.columns:
        raise KeyError("The column 'Alzheimers_Diagnosis' is missing from the dataset.")

    target = df['Alzheimers_Diagnosis']
    # Encode a copy so the caller's frame keeps its original values even if
    # the encoder assigns columns in place.
    encoded_df = encode_categorical_columns(df.copy())
    features = encoded_df.drop(columns=['Alzheimers_Diagnosis'])

    if features.isnull().sum().any():
        print("Warning: Missing values found in features, filling with median.")
        features = features.fillna(features.median())
    if target.isnull().sum() > 0:
        print("Warning: Missing values found in target, filling with mode.")
        target = target.fillna(target.mode()[0])

    # NOTE(review): the scaler (and the imputation above) is fitted on the
    # full dataset before the split, so test-set statistics reach the
    # preprocessing. Kept as-is to preserve the returned values; move these
    # steps into a Pipeline if strict train/test isolation is required.
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    feature_names = features.columns

    X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
    print(f"Features shape: {features_scaled.shape}, Target shape: {target.shape}")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # The constructor values below are only starting points; every one of
    # them that appears in param_grid is overridden during the search.
    model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=3, random_state=42)
    param_grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0]
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    print("Best Hyperparameters found: ", grid_search.best_params_)

    # With the default refit=True the search has already refitted the best
    # configuration on all of X_train, so no further fit call is needed.
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy * 100:.2f}%")

    # cross_val_score clones the estimator, so best_model itself stays fitted
    # on X_train only.
    cv_scores = cross_val_score(best_model, features_scaled, target, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average Cross-validation accuracy: {np.mean(cv_scores) * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Only the model is persisted; the fitted scaler is returned to the caller
    # but not written to disk.
    joblib.dump(best_model, 'alzheimer_model_gbc_best.pkl')
    probabilities = best_model.predict_proba(X_test)[:, 1]
    return best_model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names
def predict_risk(model, scaler, new_data):
    """Score new samples with a fitted scaler and model.

    Args:
        model: Object exposing ``predict`` and ``predict_proba``.
        scaler: Object exposing ``transform``.
        new_data: Samples to score, passed unchanged to ``scaler.transform``.

    Returns:
        tuple: ``(predictions, probabilities)`` where ``predictions`` is the
        output of ``model.predict`` and ``probabilities`` is column 1 of
        ``model.predict_proba`` on the scaled samples.
    """
    scaled = scaler.transform(new_data)
    labels = model.predict(scaled)
    # Column 1 is taken as the score for the second class reported by the model.
    return labels, model.predict_proba(scaled)[:, 1]
def _risk_label(prob):
    """Map a predicted probability to the risk category printed for it."""
    if prob < 0.3:
        return "Healthy"
    if prob < 0.7:
        return "At Low Risk"
    if prob < 0.9:
        return "At High Risk"
    # Top tier is the fall-through so probabilities of 0.95 and above are
    # reported too instead of being silently dropped.
    return "Severe Alzheimer's"


def _has_columns(df, columns):
    """Return True if ``df`` has all ``columns``; otherwise print a notice."""
    missing = [col for col in columns if col not in df.columns]
    if missing:
        print(f"Skipping plot, missing columns: {missing}")
        return False
    return True


def _plot_model_diagnostics(model, y_test, y_pred, probabilities, feature_names):
    """Show confusion matrix, ROC curve, PR curve, F1 score and feature importances."""
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No", "Yes"], yticklabels=["No", "Yes"])
    plt.title("Confusion Matrix")
    plt.show()

    fpr, tpr, _ = roc_curve(y_test, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    # Diagonal reference line for a classifier with no discriminative power.
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    precision, recall, _ = precision_recall_curve(y_test, probabilities)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='blue', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()

    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1:.2f}")

    feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': model.feature_importances_})
    feature_df = feature_df.sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_df)
    plt.title("Feature Importance")
    plt.show()


def _plot_dataset_overview(df):
    """Show age histogram, gender pie chart and 3D scatter for available columns."""
    if _has_columns(df, ['Age']):
        plt.figure(figsize=(8, 6))
        sns.histplot(df['Age'], kde=True)
        plt.title("Age Distribution")
        plt.xlabel("Age")
        plt.ylabel("Frequency")
        plt.show()

    if _has_columns(df, ['Gender']):
        gender_dist = df['Gender'].value_counts()
        plt.figure(figsize=(6, 6))
        gender_dist.plot.pie(autopct='%1.1f%%', startangle=90, colors=['#66b3ff', '#ffb3e6'])
        plt.title("Gender Distribution")
        plt.ylabel('')
        plt.show()

    if _has_columns(df, ['Age', 'Cognitive_Score1', 'Alzheimers_Diagnosis']):
        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(df['Age'], df['Cognitive_Score1'], df['Alzheimers_Diagnosis'],
                   c=df['Alzheimers_Diagnosis'], cmap='coolwarm', marker='o')
        ax.set_xlabel('Age')
        ax.set_ylabel('Cognitive Score 1')
        ax.set_zlabel("Alzheimer's Diagnosis")
        ax.set_title('3D Scatter Plot (Age vs Cognitive Scores vs Diagnosis)')
        plt.show()


def display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df):
    """Print predictions and risk tiers, then show evaluation and dataset plots.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        X_train: Unused; kept so existing callers keep working.
        X_test: Test features; only its length is checked against ``y_test``.
        y_test: True labels for the test set.
        y_pred: Predicted labels for the test set.
        probabilities: Predicted probability per test sample, used for the
            risk tiers and the ROC / precision-recall curves.
        feature_names: Names matching ``model.feature_importances_``.
        df: Dataset used for the descriptive plots; a plot is skipped with a
            printed notice when a column it needs is absent.

    Raises:
        ValueError: If ``X_test`` and ``y_test`` differ in length.
    """
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test must have the same length.")

    print("\nPredictions (0: No, 1: Yes for Alzheimer's risk):")
    print(y_pred)
    print("\nPredicted probabilities for Alzheimer's (0 to 1 scale):")
    print(probabilities)

    print("\nRisk Insights:")
    for number, prob in enumerate(probabilities, start=1):
        print(f"Individual {number}: {_risk_label(prob)} (Probability: {prob:.2f})")

    _plot_model_diagnostics(model, y_test, y_pred, probabilities, feature_names)
    _plot_dataset_overview(df)
def main():
    """Run the interactive workflow: pick a data source, train, then report.

    Shows a two-option menu; choice ``'1'`` uploads a CSV and any other input
    generates synthetic data. Any exception raised while choosing or loading
    is reported and synthetic data is used instead. The resulting frame is
    passed to ``train_model`` and its outputs to ``display_output``.
    """
    try:
        for menu_line in ("1. Upload CSV file", "2. Generate Synthetic Data"):
            print(menu_line)
        choice = input("Enter choice (1/2): ")
        df = upload_csv_file() if choice == '1' else generate_synthetic_data()
    except Exception as err:
        # Deliberately broad: whatever goes wrong here, fall back to
        # synthetic data so the rest of the workflow can still run.
        print(f"Error uploading CSV, falling back to synthetic data: {err}")
        df = generate_synthetic_data()

    results = train_model(df)
    # Positions follow train_model's return order; scaler, y_train and
    # accuracy (indices 1, 4, 7) are not needed for display.
    model, X_train, X_test = results[0], results[2], results[3]
    y_test, y_pred = results[5], results[6]
    probabilities, feature_names = results[8], results[9]
    display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df)
# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()