# Page residue captured together with the file (not part of the program):
# Spencer525's picture | Create app.py | cfab2e3 verified | raw | history blame | 3.78 kB
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import mutual_info_classif
import io
import base64
# Function to create a download link
def get_download_link(data, filename, text):
    """Return an HTML ``<a>`` tag that downloads *data* via a base64 data URI.

    Args:
        data: Raw bytes of the file to offer for download.
        filename: Name suggested to the browser; its extension also selects
            the MIME type placed in the data URI.
        text: Link label. It is inserted verbatim, so callers may pass markup.

    Returns:
        The anchor element as a string.
    """
    # Local imports keep this helper self-contained; the file's import block
    # does not bring these stdlib modules into scope.
    import html
    import mimetypes
    import os

    # Types this app actually emits are pinned explicitly so the result does
    # not depend on the host's MIME database.
    explicit_types = {
        ".csv": "text/csv",
        ".png": "image/png",
        ".xlsx": (
            "application/vnd.openxmlformats-officedocument"
            ".spreadsheetml.sheet"
        ),
    }
    extension = os.path.splitext(filename)[1].lower()
    mime_type = (
        explicit_types.get(extension)
        or mimetypes.guess_type(filename)[0]
        or "application/octet-stream"
    )

    b64 = base64.b64encode(data).decode()
    # The filename lands inside a double-quoted attribute, so quotes and
    # angle brackets must be escaped to keep the markup well-formed.
    safe_name = html.escape(filename, quote=True)
    return (
        f'<a href="data:{mime_type};base64,{b64}" '
        f'download="{safe_name}">{text}</a>'
    )
# Function to plot correlation matrix
def plot_correlation_matrix(data):
    """Render a correlation heatmap of *data*'s numeric columns in Streamlit.

    Args:
        data: DataFrame to analyse. Non-numeric columns are ignored, because
            a correlation cannot be computed for them.

    Returns:
        The matplotlib figure that was drawn, or ``None`` when *data* has no
        numeric columns. The figure is left open (and current) so callers can
        still save it, e.g. with ``plt.savefig``.
    """
    # Select numeric columns explicitly: DataFrame.corr() raises on string
    # columns in recent pandas releases instead of silently dropping them.
    numeric_data = data.select_dtypes(include='number')
    if numeric_data.shape[1] == 0:
        st.warning("No numeric columns available for a correlation matrix.")
        return None

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        numeric_data.corr(),
        annot=True,
        cmap='coolwarm',
        linewidths=0.5,
        ax=ax,
    )
    ax.set_title('Correlation Matrix')
    fig.tight_layout()
    # Hand Streamlit the figure object rather than the pyplot module.
    st.pyplot(fig)
    return fig
# Function to calculate feature importance
def calculate_feature_importance(X, y, test_size=0.2, random_state=42):
    """Score every feature of *X* against target *y* with five methods.

    The data is split into train/test parts and standardised (the scaler is
    fitted on the training part only). Three tree-based classifiers provide
    their built-in importances; permutation importance is measured on the
    held-out part with the fitted random forest; mutual information is
    estimated on the training part.

    Args:
        X: Feature table (numeric columns).
        y: Class labels, one per row of *X*.
        test_size: Fraction of rows held out for permutation importance.
        random_state: Seed shared by the split, the models and the estimators
            so repeated runs give the same numbers.

    Returns:
        Dict mapping method name ("Decision Tree", "Random Forest",
        "XGBoost", "Permutation", "Mutual Information") to an array with one
        score per column of *X*.

    Raises:
        ValueError: If *y* contains missing values.
    """
    if pd.isna(y).any():
        raise ValueError("Target column contains missing values.")

    # Map labels to consecutive integers 0..n_classes-1 (sorted order).
    # Recent XGBoost releases reject other label sets such as {1, 2} or
    # strings; the scikit-learn estimators are unaffected by the relabeling.
    y_encoded, _ = pd.factorize(y, sort=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=test_size, random_state=random_state
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    methods = {
        "Decision Tree": DecisionTreeClassifier(random_state=random_state),
        "Random Forest": RandomForestClassifier(
            n_estimators=100, random_state=random_state
        ),
        "XGBoost": XGBClassifier(random_state=random_state),
    }

    importance_dict = {}
    for name, model in methods.items():
        model.fit(X_train_scaled, y_train)
        importance_dict[name] = model.feature_importances_

    # Permutation Importance: reuse the forest fitted above instead of
    # training a second, identically configured one.
    perm_importance = permutation_importance(
        methods["Random Forest"],
        X_test_scaled,
        y_test,
        n_repeats=10,
        random_state=random_state,
    )
    importance_dict["Permutation"] = perm_importance.importances_mean

    # Mutual Information: seeded so the estimate is reproducible like the
    # other methods.
    importance_dict["Mutual Information"] = mutual_info_classif(
        X_train_scaled, y_train, random_state=random_state
    )

    return importance_dict
# Streamlit app: upload a CSV, pick the target column, then show and offer
# downloads of a correlation heatmap and a feature-importance table.
st.title('Heart Disease Feature Analysis')

# File upload
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    data = pd.read_csv(uploaded_file)
    st.write("Data Preview:")
    st.write(data.head())

    # Select target variable
    target_col = st.selectbox("Select the target variable", data.columns)

    if st.button('Analyze'):
        # Only numeric columns can be scaled and fed to the models; anything
        # else is reported to the user instead of crashing the analysis.
        features = data.drop(columns=[target_col])
        X = features.select_dtypes(include='number')
        skipped = [col for col in features.columns if col not in X.columns]
        y = data[target_col]

        # Correlation Matrix
        st.subheader('Correlation Matrix')
        plot_correlation_matrix(data)

        # Download correlation matrix as PNG
        fig = plt.gcf()  # the figure the plotting helper left current
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
        # Streamlit re-runs this script on every interaction; close the
        # figure so figures do not pile up in memory.
        plt.close(fig)
        st.markdown(
            get_download_link(
                buf.getvalue(),
                "correlation_matrix.png",
                "Download Correlation Matrix as PNG",
            ),
            unsafe_allow_html=True,
        )

        # Feature Importance
        st.subheader('Feature Importance')
        if skipped:
            st.warning(
                "Skipping non-numeric feature columns: "
                + ", ".join(map(str, skipped))
            )

        if X.empty:
            st.error("No numeric feature data available for the analysis.")
        else:
            importance_dict = calculate_feature_importance(X, y)

            # Create a DataFrame with all feature importances
            importance_df = pd.DataFrame(importance_dict, index=X.columns)
            st.write(importance_df)

            # Download feature importance as XLSX
            excel_buffer = io.BytesIO()
            with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
                importance_df.to_excel(writer, sheet_name='Feature Importance')
            st.markdown(
                get_download_link(
                    excel_buffer.getvalue(),
                    "feature_importance.xlsx",
                    "Download Feature Importance as XLSX",
                ),
                unsafe_allow_html=True,
            )
else:
    st.write("Please upload a CSV file to begin the analysis.")