|
import pandas as pd |
|
import numpy as np |
|
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV |
|
from sklearn.ensemble import GradientBoostingClassifier |
|
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, f1_score |
|
from sklearn.preprocessing import StandardScaler, LabelEncoder |
|
import joblib |
|
from google.colab import files |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from mpl_toolkits.mplot3d import Axes3D |
|
|
|
|
|
def upload_csv_file():
    """Prompt for a CSV upload in Google Colab and load it into a DataFrame.

    Uses the module-level ``files`` helper imported from ``google.colab``.
    The result of ``files.upload()`` is treated as a mapping keyed by file
    name; the first key is passed to ``pandas.read_csv``. The column index of
    the loaded frame is printed so the user can compare it with the columns
    the rest of the pipeline expects.

    Returns:
        pandas.DataFrame: Contents of the first uploaded file.

    Raises:
        ValueError: If the upload finished without any file.
    """
    print("Please upload your CSV file.")
    uploaded = files.upload()

    # Without this guard an empty upload surfaces as a bare StopIteration
    # from next(), which carries no message for the caller to report.
    if not uploaded:
        raise ValueError("No file was uploaded.")

    file_name = next(iter(uploaded))
    if len(uploaded) > 1:
        # Only one dataset can be processed; say which one was picked.
        print(f"Multiple files uploaded; using only '{file_name}'.")

    df = pd.read_csv(file_name)

    print("Columns in the uploaded file:")
    print(df.columns)

    return df
|
|
|
|
|
def generate_synthetic_data(n_samples=100, seed=42):
    """Build a reproducible random dataset with the columns the pipeline uses.

    Every column is drawn independently, so the target carries no real
    signal; the data is only useful for exercising the pipeline end to end.

    A private ``numpy.random.RandomState`` is used instead of reseeding the
    global NumPy generator. With the default ``seed`` this yields the same
    values as seeding the global generator with 42, but it no longer resets
    the random state for the rest of the program.

    Args:
        n_samples: Number of rows to generate (default 100).
        seed: Seed for the private random generator (default 42).

    Returns:
        pandas.DataFrame: ``n_samples`` rows with the columns Age, Gender,
        Cholesterol, Systolic_BP, Diastolic_BP, Cognitive_Score1-3,
        Family_History, Medical_History, Test_Result and
        Alzheimers_Diagnosis.
    """
    rng = np.random.RandomState(seed)

    # Draw order matters: it fixes which part of the random stream each
    # column receives, keeping the default output stable.
    data = {
        'Age': rng.randint(40, 80, n_samples),
        'Gender': rng.choice(['Male', 'Female'], n_samples),
        'Cholesterol': rng.randint(150, 250, n_samples),
        'Systolic_BP': rng.randint(90, 180, n_samples),
        'Diastolic_BP': rng.randint(60, 120, n_samples),
        'Cognitive_Score1': rng.randint(1, 4, n_samples),
        'Cognitive_Score2': rng.randint(1, 4, n_samples),
        'Cognitive_Score3': rng.randint(1, 4, n_samples),
        'Family_History': rng.choice([0, 1], n_samples),
        'Medical_History': rng.choice([0, 1], n_samples),
        'Test_Result': rng.choice([0, 1], n_samples),
        'Alzheimers_Diagnosis': rng.choice([0, 1], n_samples),
    }

    return pd.DataFrame(data)
|
|
|
|
|
def encode_categorical_columns(df):
    """Return a copy of ``df`` with the known categorical columns label-encoded.

    The columns 'Gender', 'Test_Result', 'Family_History' and
    'Medical_History' are each passed through ``LabelEncoder.fit_transform``
    when present; columns that are absent are skipped.

    The input frame is left untouched. Encoding in place would overwrite the
    caller's original labels (for example 'Male'/'Female'), which the
    plotting code later reads from the same frame.

    Args:
        df: Input DataFrame.

    Returns:
        pandas.DataFrame: A new frame with the listed columns replaced by
        integer codes.
    """
    categorical_columns = ['Gender', 'Test_Result', 'Family_History', 'Medical_History']

    encoded = df.copy()

    for col in categorical_columns:
        if col in encoded.columns:
            # A fresh encoder per column: the fitted classes are not reused.
            encoded[col] = LabelEncoder().fit_transform(encoded[col])

    return encoded
|
|
|
|
|
def train_model(df):
    """Tune, train and evaluate a gradient-boosting classifier on ``df``.

    Pipeline:
      1. Label-encode the known categorical columns (on a copy of ``df``).
      2. Fill missing feature values with column medians and missing target
         values with the target mode.
      3. Hold out 20% of the rows as a test set (``random_state=42``).
      4. Standardise features with a scaler fitted on the training rows only.
      5. Grid-search ``GradientBoostingClassifier`` hyperparameters with
         5-fold cross-validation on the training rows.
      6. Report test accuracy, 5-fold cross-validation scores and a
         classification report.
      7. Save the best model to 'alzheimer_model_gbc_best.pkl'.

    Args:
        df: DataFrame with an 'Alzheimers_Diagnosis' target column; all other
            columns are used as features. The frame is not modified.

    Returns:
        tuple: ``(best_model, scaler, X_train, X_test, y_train, y_test,
        y_pred, accuracy, probabilities, feature_names)`` where ``X_train``
        and ``X_test`` are the scaled feature arrays and ``probabilities`` is
        column 1 of ``predict_proba`` on the test set.

    Raises:
        KeyError: If 'Alzheimers_Diagnosis' is not a column of ``df``.
    """
    if 'Alzheimers_Diagnosis' not in df.columns:
        raise KeyError("The column 'Alzheimers_Diagnosis' is missing from the dataset.")

    # Encode a copy so the caller's frame keeps its original labels even if
    # the encoder works in place.
    encoded = encode_categorical_columns(df.copy())

    features = encoded.drop(columns=['Alzheimers_Diagnosis'])
    target = encoded['Alzheimers_Diagnosis']

    if features.isnull().sum().any():
        print("Warning: Missing values found in features, filling with median.")
        features = features.fillna(features.median())

    if target.isnull().sum() > 0:
        print("Warning: Missing values found in target, filling with mode.")
        target = target.fillna(target.mode()[0])

    feature_names = features.columns

    # Split before scaling so the scaler never sees the held-out rows.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    # Whole dataset on the same scale, used for the cross-validation report.
    features_scaled = scaler.transform(features)

    print(f"Features shape: {features_scaled.shape}, Target shape: {target.shape}")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=3, random_state=42)

    param_grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0],
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    print("Best Hyperparameters found: ", grid_search.best_params_)

    # GridSearchCV refits the best parameter set on all of X_train by default
    # (refit=True), so best_estimator_ is already trained; fitting it again
    # would only repeat the same work.
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy * 100:.2f}%")

    # NOTE: this cross-validation runs over every row, including the ones
    # held out above, so it is a separate estimate rather than a check on
    # the test-set accuracy.
    cv_scores = cross_val_score(best_model, features_scaled, target, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average Cross-validation accuracy: {np.mean(cv_scores) * 100:.2f}%")

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Only the classifier is persisted; the scaler is returned to the caller
    # but not written to disk.
    joblib.dump(best_model, 'alzheimer_model_gbc_best.pkl')

    probabilities = best_model.predict_proba(X_test)[:, 1]

    return best_model, scaler, X_train, X_test, y_train, y_test, y_pred, accuracy, probabilities, feature_names
|
|
|
|
|
def predict_risk(model, scaler, new_data):
    """Score new samples with a fitted scaler and classifier.

    Args:
        model: Object exposing ``predict`` and ``predict_proba``.
        scaler: Object exposing ``transform``; applied to ``new_data`` before
            it reaches the model.
        new_data: Feature rows to score, in whatever form ``scaler.transform``
            accepts.

    Returns:
        tuple: ``(predictions, probabilities)`` where ``predictions`` is the
        output of ``model.predict`` and ``probabilities`` is column 1 of
        ``model.predict_proba`` (presumably the positive class).
    """
    scaled_rows = scaler.transform(new_data)
    return model.predict(scaled_rows), model.predict_proba(scaled_rows)[:, 1]
|
|
|
|
|
def _risk_category(prob):
    """Return the risk label printed for a predicted probability.

    Bands: below 0.3 "Healthy", below 0.7 "At Low Risk", below 0.9
    "At High Risk", and 0.9 or above "Severe Alzheimer's". The top band is
    open-ended so that every probability receives a label.
    """
    if prob < 0.3:
        return "Healthy"
    if prob < 0.7:
        return "At Low Risk"
    if prob < 0.9:
        return "At High Risk"
    return "Severe Alzheimer's"


def _columns_present(df, required, plot_name):
    """Return True if ``df`` has every column in ``required``, else report the skip."""
    missing = [col for col in required if col not in df.columns]
    if missing:
        print(f"Skipping {plot_name}: missing column(s) {missing}")
        return False
    return True


def display_output(model, X_train, X_test, y_test, y_pred, probabilities, feature_names, df):
    """Print predictions and risk labels, then draw the evaluation plots.

    Output, in order: raw predictions, predicted probabilities, a risk label
    per test individual, confusion-matrix heatmap, ROC curve,
    precision-recall curve, F1 score, feature-importance bar chart, age
    histogram, gender pie chart and a 3D scatter of age, cognitive score 1
    and diagnosis. The three descriptive plots read columns from ``df`` and
    are skipped with a message when a column they need is absent.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        X_train: Unused; kept so existing callers keep working.
        X_test: Test features; only its length is checked against ``y_test``.
        y_test: True test labels.
        y_pred: Predicted test labels.
        probabilities: Predicted probabilities, one per test row.
        feature_names: Names matching ``model.feature_importances_``.
        df: Dataset used for the descriptive plots.

    Raises:
        ValueError: If ``X_test`` and ``y_test`` differ in length.
    """
    if len(X_test) != len(y_test):
        raise ValueError("X_test and y_test must have the same length.")

    print("\nPredictions (0: No, 1: Yes for Alzheimer's risk):")
    print(y_pred)

    print("\nPredicted probabilities for Alzheimer's (0 to 1 scale):")
    print(probabilities)

    print("\nRisk Insights:")
    for number, prob in enumerate(probabilities, start=1):
        print(f"Individual {number}: {_risk_category(prob)} (Probability: {prob:.2f})")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No", "Yes"], yticklabels=["No", "Yes"])
    plt.title("Confusion Matrix")
    plt.show()

    # ROC curve with the chance diagonal for reference
    fpr, tpr, _ = roc_curve(y_test, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Precision-recall curve
    precision, recall, _ = precision_recall_curve(y_test, probabilities)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='blue', lw=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()

    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1:.2f}")

    # Feature importance, most important first
    feature_importance = model.feature_importances_
    feature_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    feature_df = feature_df.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(8, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_df)
    plt.title("Feature Importance")
    plt.show()

    # Descriptive plots: these depend on specific dataset columns that an
    # uploaded CSV is not guaranteed to contain.
    if _columns_present(df, ['Age'], "age distribution plot"):
        plt.figure(figsize=(8, 6))
        sns.histplot(df['Age'], kde=True)
        plt.title("Age Distribution")
        plt.xlabel("Age")
        plt.ylabel("Frequency")
        plt.show()

    if _columns_present(df, ['Gender'], "gender distribution plot"):
        gender_dist = df['Gender'].value_counts()
        plt.figure(figsize=(6, 6))
        gender_dist.plot.pie(autopct='%1.1f%%', startangle=90, colors=['#66b3ff', '#ffb3e6'])
        plt.title("Gender Distribution")
        plt.ylabel('')
        plt.show()

    if _columns_present(df, ['Age', 'Cognitive_Score1', 'Alzheimers_Diagnosis'], "3D scatter plot"):
        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(
            df['Age'],
            df['Cognitive_Score1'],
            df['Alzheimers_Diagnosis'],
            c=df['Alzheimers_Diagnosis'],
            cmap='coolwarm',
            marker='o',
        )
        ax.set_xlabel('Age')
        ax.set_ylabel('Cognitive Score 1')
        ax.set_zlabel("Alzheimer's Diagnosis")
        ax.set_title('3D Scatter Plot (Age vs Cognitive Scores vs Diagnosis)')
        plt.show()
|
|
|
|
|
def main():
    """Run the interactive pipeline: pick a data source, train, then report.

    The user is asked to choose between uploading a CSV (choice '1') and
    generating synthetic data (any other input). Any exception raised while
    prompting or loading is reported and the synthetic dataset is used
    instead. The resulting frame is passed to ``train_model`` and the
    results to ``display_output``.
    """
    try:
        for menu_line in ("1. Upload CSV file", "2. Generate Synthetic Data"):
            print(menu_line)
        selection = input("Enter choice (1/2): ")
        load_data = upload_csv_file if selection == '1' else generate_synthetic_data
        df = load_data()
    except Exception as e:
        print(f"Error uploading CSV, falling back to synthetic data: {e}")
        df = generate_synthetic_data()

    results = train_model(df)
    (model, scaler, X_train, X_test, y_train, y_test,
     y_pred, accuracy, probabilities, feature_names) = results

    display_output(
        model,
        X_train,
        X_test,
        y_test,
        y_pred,
        probabilities,
        feature_names,
        df,
    )
|
|
|
|
|
# Run the interactive pipeline only when this file is executed as a script.
# The module attribute is spelled with double underscores (__name__); the
# single-underscore spelling is an undefined name and raises NameError.
if __name__ == "__main__":
    main()