# iDrops's picture
# Upload 5 files
# 4b9b315 verified
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

# --- Load the dataset --------------------------------------------------------
# Read the CSV and echo the first rows as a quick visual sanity check.
# NOTE(review): no missing-value check is performed in this section.
data = pd.read_csv('Cardio_Vascular_Disease_by_Gut_Microbiota.csv')
print(data.head())

# --- Features and target -----------------------------------------------------
# The label column is the target; every column other than the patient
# identifier and the label is used as a feature.
y = data['CVD_Status']
X = data.drop(columns=['patient_id', 'CVD_Status'])

# --- Random-forest feature ranking -------------------------------------------
# The forest is fitted on the full dataset (no train/test split at this point)
# and is only used to rank the features; the seed keeps the ranking repeatable.
rf = RandomForestClassifier(random_state=42).fit(X, y)
importances = rf.feature_importances_

# One row per feature, most important first, so the bar chart reads top-down.
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.title('Feature Importance from Random Forest')
plt.show()
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. The same split is reused by the model sections that follow.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Gradient Boosting: fit on the training split, score on the hold-out -----
gradient_boosting = GradientBoostingClassifier(random_state=42)
gradient_boosting.fit(X_train, y_train)

y_pred_gb = gradient_boosting.predict(X_test)
conf_matrix_gb = confusion_matrix(y_test, y_pred_gb)
accuracy_gb = accuracy_score(y_test, y_pred_gb)

print(f"Gradient Boosting Accuracy: {accuracy_gb * 100:.2f}%")
print(f"Confusion Matrix:\n {conf_matrix_gb}\n")

# Column 1 of predict_proba is kept as the score for the metrics below.
y_pred_prob_gb = gradient_boosting.predict_proba(X_test)[:, 1]
# Hard class labels (repeats the predict call made above).
y_pred_gb = gradient_boosting.predict(X_test)

# Regression-style metrics: they compare the true labels against the predicted
# probability, not against the hard class prediction.
mse_gb = mean_squared_error(y_test, y_pred_prob_gb)
rmse_gb = sqrt(mse_gb)
mae_gb = mean_absolute_error(y_test, y_pred_prob_gb)
r2_gb = r2_score(y_test, y_pred_prob_gb)

# Full report: accuracy, error metrics, then the confusion matrix again.
print(f"Gradient Boosting Accuracy: {accuracy_gb * 100:.2f}%")
print(f"R² Score: {r2_gb:.4f}, RMSE: {rmse_gb:.4f}, MSE: {mse_gb:.4f}, MAE: {mae_gb:.4f}")
print(f"Confusion Matrix:\n {conf_matrix_gb}\n")
# --- XGBoost: fit on the training split, score on the hold-out ---------------
# NOTE(review): use_label_encoder is deprecated in newer XGBoost releases —
# confirm whether it is still needed for the installed version.
xgboost = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgboost.fit(X_train, y_train)

y_pred_xgb = xgboost.predict(X_test)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"XGBoost Accuracy: {accuracy_xgb * 100:.2f}%")
print(f"Confusion Matrix:\n {conf_matrix_xgb}\n")

# Column 1 of predict_proba is kept as the score for the metrics below.
y_pred_prob_xgb = xgboost.predict_proba(X_test)[:, 1]
# Hard class labels (repeats the predict call made above).
y_pred_xgb = xgboost.predict(X_test)

# Regression-style metrics: they compare the true labels against the predicted
# probability, not against the hard class prediction.
mse_xgb = mean_squared_error(y_test, y_pred_prob_xgb)
rmse_xgb = sqrt(mse_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_prob_xgb)
r2_xgb = r2_score(y_test, y_pred_prob_xgb)

# Full report: accuracy, error metrics, then the confusion matrix again.
print(f"XGBoost Accuracy: {accuracy_xgb * 100:.2f}%")
print(f"R² Score: {r2_xgb:.4f}, RMSE: {rmse_xgb:.4f}, MSE: {mse_xgb:.4f}, MAE: {mae_xgb:.4f}")
print(f"Confusion Matrix:\n {conf_matrix_xgb}\n")
# --- LightGBM: fit on the training split, score on the hold-out --------------
lightgbm = LGBMClassifier(random_state=42)
lightgbm.fit(X_train, y_train)

y_pred_lgbm = lightgbm.predict(X_test)
conf_matrix_lgbm = confusion_matrix(y_test, y_pred_lgbm)
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)

print(f"LightGBM Accuracy: {accuracy_lgbm * 100:.2f}%")
print(f"Confusion Matrix:\n {conf_matrix_lgbm}\n")

# Column 1 of predict_proba is kept as the score for the metrics below.
y_pred_prob_lgbm = lightgbm.predict_proba(X_test)[:, 1]
# Hard class labels (repeats the predict call made above).
y_pred_lgbm = lightgbm.predict(X_test)

# Regression-style metrics: they compare the true labels against the predicted
# probability, not against the hard class prediction.
mse_lgbm = mean_squared_error(y_test, y_pred_prob_lgbm)
rmse_lgbm = sqrt(mse_lgbm)
mae_lgbm = mean_absolute_error(y_test, y_pred_prob_lgbm)
r2_lgbm = r2_score(y_test, y_pred_prob_lgbm)

# Full report: accuracy, error metrics, then the confusion matrix again.
print(f"LightGBM Accuracy: {accuracy_lgbm * 100:.2f}%")
print(f"R² Score: {r2_lgbm:.4f}, RMSE: {rmse_lgbm:.4f}, MSE: {mse_lgbm:.4f}, MAE: {mae_lgbm:.4f}")
print(f"Confusion Matrix:\n {conf_matrix_lgbm}\n")
import joblib
from sklearn.ensemble import GradientBoostingClassifier

# Fit a fresh Gradient Boosting classifier on the training split. This is the
# estimator bound to `model`, which the prediction function calls.
model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Persist the fitted estimator so it can be reloaded without retraining.
joblib.dump(model, 'trained_model.pkl')
print("Model saved successfully as trained_model.pkl")
def predict_cvd(Age, Gender, BMI, Blood_pressure, cholesterol, Bacteroides_fragilis, Faecalibacterium_prausnitzii,
                Akkermansia_muciniphila, Ruminococcus_bromii, Microbiome_Diversity):
    """Predict cardiovascular-disease status for a single patient.

    The arguments are assembled into a one-row DataFrame and passed to the
    module-level ``model``.

    Args:
        Age, BMI, Blood_pressure, cholesterol: clinical measurements.
        Gender: string; ``'female'`` (any letter case) is encoded as 1 and
            every other string as 0.
        Bacteroides_fragilis, Faecalibacterium_prausnitzii,
        Akkermansia_muciniphila, Ruminococcus_bromii, Microbiome_Diversity:
            gut-microbiota measurements.

    Returns:
        ``"Cardiovascular Disease Detected"`` when the model predicts 1,
        otherwise ``"No Cardiovascular Disease Detected"``.
    """
    # Encode gender numerically (assuming Male: 0, Female: 1).
    Gender = 1 if Gender.lower() == 'female' else 0

    # One-row frame whose column names must match the names the model was
    # fitted with. The cholesterol column is keyed 'Cholesterol' (capital C),
    # the spelling the later Gradio-facing definition in this file uses; a
    # lowercase key would not match the fitted feature names.
    # NOTE(review): assumes the training CSV spells the column 'Cholesterol' —
    # confirm against the dataset header.
    input_data = pd.DataFrame({
        'Age': [Age],
        'Gender': [Gender],
        'BMI': [BMI],
        'Blood_pressure': [Blood_pressure],
        'Cholesterol': [cholesterol],
        'Bacteroides_fragilis': [Bacteroides_fragilis],
        'Faecalibacterium_prausnitzii': [Faecalibacterium_prausnitzii],
        'Akkermansia_muciniphila': [Akkermansia_muciniphila],
        'Ruminococcus_bromii': [Ruminococcus_bromii],
        'Microbiome_Diversity': [Microbiome_Diversity],
    })
    print(input_data)  # Echo the assembled row for debugging.

    # Predict CVD status (0 or 1) and map it to a readable message.
    prediction = model.predict(input_data)
    return "Cardiovascular Disease Detected" if prediction[0] == 1 else "No Cardiovascular Disease Detected"
import gradio as gr
import pandas as pd
import joblib
# Load the pre-trained model from the file written by joblib.dump earlier in
# this script.
# NOTE(review): joblib.load unpickles the file, so it should only ever be
# pointed at a trusted, locally produced artifact.
model = joblib.load('trained_model.pkl')
# Define the prediction function
def predict_cvd(Age, Gender, BMI, Blood_pressure, Cholesterol, Bacteroides_fragilis, Faecalibacterium_prausnitzii,
                Akkermansia_muciniphila, Ruminococcus_bromii, Microbiome_Diversity):
    """Predict cardiovascular-disease status for one patient (Gradio callback).

    The arguments are assembled into a one-row DataFrame and passed to the
    module-level ``model``. This function never raises: every failure is
    reported as a string starting with ``"An error occurred: "`` so the UI can
    display it.

    Args:
        Age, BMI, Blood_pressure, Cholesterol: clinical measurements.
        Gender: string; ``'female'`` (any letter case) is encoded as 1 and
            every other string as 0. A non-string value (for example ``None``
            when nothing is selected in the dropdown) yields an error message.
        Bacteroides_fragilis, Faecalibacterium_prausnitzii,
        Akkermansia_muciniphila, Ruminococcus_bromii, Microbiome_Diversity:
            gut-microbiota measurements.

    Returns:
        ``"Cardiovascular Disease Detected"`` when the model predicts 1,
        ``"No Cardiovascular Disease Detected"`` otherwise, or an
        ``"An error occurred: ..."`` message on failure.
    """
    # The Gender dropdown is declared without a default value, so the callback
    # may receive None. Report that explicitly instead of leaking a raw
    # "'NoneType' object has no attribute 'lower'" message to the user.
    if not isinstance(Gender, str):
        return "An error occurred: Gender must be 'Male' or 'Female'."

    try:
        # Encode gender numerically (assuming Male: 0, Female: 1).
        gender_code = 1 if Gender.lower() == 'female' else 0

        # One-row frame; the keys are the feature names the model expects
        # (note the capital "C" in 'Cholesterol').
        input_data = pd.DataFrame({
            'Age': [Age],
            'Gender': [gender_code],
            'BMI': [BMI],
            'Blood_pressure': [Blood_pressure],
            'Cholesterol': [Cholesterol],
            'Bacteroides_fragilis': [Bacteroides_fragilis],
            'Faecalibacterium_prausnitzii': [Faecalibacterium_prausnitzii],
            'Akkermansia_muciniphila': [Akkermansia_muciniphila],
            'Ruminococcus_bromii': [Ruminococcus_bromii],
            'Microbiome_Diversity': [Microbiome_Diversity],
        })

        prediction = model.predict(input_data)
        return "Cardiovascular Disease Detected" if prediction[0] == 1 else "No Cardiovascular Disease Detected"
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing the app.
        return f"An error occurred: {str(e)}"
# Input widgets, listed in the same order as predict_cvd's parameters.
# The five gut-microbiota sliders share one 0-10 scale, so they are generated
# from their labels instead of being spelled out one by one.
inputs = [
    gr.Slider(18, 100, step=1, value=50, label="Age"),
    gr.Dropdown(['Male', 'Female'], label="Gender"),
    gr.Slider(10.0, 50.0, step=0.1, value=25.0, label="BMI"),
    gr.Slider(90, 200, step=1, value=120, label="Blood Pressure"),
    gr.Slider(100, 300, step=1, value=180, label="Cholesterol"),
    *(
        gr.Slider(0.0, 10.0, step=0.1, value=5.0, label=caption)
        for caption in (
            "Bacteroides Fragilis Level",
            "Faecalibacterium Prausnitzii Level",
            "Akkermansia Muciniphila Level",
            "Ruminococcus Bromii Level",
            "Microbiome Diversity",
        )
    ),
]

# Wire the prediction function to the widgets; the result is shown as text.
iface = gr.Interface(
    fn=predict_cvd,
    inputs=inputs,
    outputs="text",
    title="Cardiovascular Disease Prediction",
)

# Start the web UI.
iface.launch()