Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
# Read the CSV once at import time; eda() draws every chart from this frame.
data = pd.read_csv(filepath_or_buffer="P1G5_Set_1_Titan_Russo.csv")
def _show_and_close(fig):
    """Render *fig* on the Streamlit page, then release it from pyplot."""
    st.pyplot(fig)
    # pyplot keeps a reference to every figure until it is closed; without
    # this, each rerun of the page would leave more figures in memory.
    plt.close(fig)


def eda():
    """Render the exploratory-data-analysis page.

    Draws three sections from the module-level ``data`` frame, each followed
    by a short written explanation:

    1. Scatter plots of ``limit_balance`` against ``bill_amt_1`` ..
       ``bill_amt_6``, coloured by ``default_payment_next_month``.
    2. An annotated heatmap of the correlation matrix.
    3. Histograms (with KDE) of the columns listed in ``cols_num``.

    Every figure is passed explicitly to ``st.pyplot`` instead of relying on
    the global pyplot figure, so no deprecation option has to be toggled.
    """
    st.title("Eksploratory Data Analysis")
    st.write('Analyze the DataFrame for Better Understanding')

    # --- Section 1: limit balance vs. bill amount -------------------------
    st.markdown("<h2><b>Limit Balance vs. Bill Amount by Default Payment Next Month</b></h2>",
                unsafe_allow_html=True)
    palette = ["#FF0000", "#4129E1"]  # custom colors
    for i in range(1, 7):
        fig, ax = plt.subplots()
        sns.scatterplot(
            x="limit_balance",
            y=f"bill_amt_{i}",
            hue="default_payment_next_month",
            data=data,
            palette=palette,
            ax=ax,
        )
        # The y-axis is bill_amt_{i}, so the title must say "Bill Amount"
        # (it previously said "Pay Amount").
        ax.set_title(
            f"Limit Balance vs. Bill Amount {i} by Default Payment Next Month")
        _show_and_close(fig)

    st.write("**Explanation**:")
    markdown_text = """
    These plots indicate that a higher `limit_balance` means a higher likelihood of non defaulting on payments.
    """
    st.markdown(markdown_text)

    # --- Section 2: correlation heatmap -----------------------------------
    st.markdown("<h2><b>Heatmap of Correlation Matrix</b></h2>",
                unsafe_allow_html=True)
    # Restrict to numeric/bool columns explicitly: on pandas >= 2.0 a bare
    # DataFrame.corr() raises if any column is non-numeric, whereas older
    # versions silently dropped such columns.
    corr_matrix = data.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title('Heatmap of Correlation Matrix')
    _show_and_close(fig)

    st.write("")  # Add a blank line
    st.write("**Explanation**:")
    markdown_text = """
    From the heatmap we can see the correlation between each columns. We can see the `pay_0`, `pay_2`, `pay_3`, `pay_4`, `pay_5`, `pay_6` have correlation each others from categorical columns meanwhile we can see `bill_amt_1`, `bill_amt_2`, `bill_amt_3`, `bill_amt_4`, `bill_amt_5`, `bill_amt_6` have correlation each others from numerical columns
    """
    st.markdown(markdown_text)

    # --- Section 3: distributions -----------------------------------------
    st.markdown("<h2><b>Checking Distribution Data</b></h2>",
                unsafe_allow_html=True)
    # Columns to plot
    cols_num = ['limit_balance', 'age', 'bill_amt_1',
                'bill_amt_2', 'bill_amt_3', 'bill_amt_4', 'bill_amt_5', 'bill_amt_6',
                'pay_amt_1', 'pay_amt_2', 'pay_amt_3', 'pay_amt_4', 'pay_amt_5',
                'pay_amt_6']
    # A 5x4 grid of subplots, flattened so it can be indexed linearly.
    fig, axes = plt.subplots(5, 4, figsize=(18, 15))
    axes = axes.flatten()
    for ax, col in zip(axes, cols_num):
        # Histogram with a kernel density estimate overlay.
        sns.histplot(data[col], ax=ax, kde=True)
        ax.set_title(f'Distribution {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
    # The grid has more cells than columns; drop the unused axes.
    for unused_ax in axes[len(cols_num):]:
        unused_ax.remove()
    fig.tight_layout()
    _show_and_close(fig)

    st.write("")  # Add a blank line
    st.write("**Explanation**:")
    st.write('Checking the distribution data from non categorical columns, we can say the data is positive skewed')