TRusso's picture
Upload 7 files
5a7ae2a verified
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the project's CSV dataset once, at import time. The path is relative,
# so it is resolved against the process's current working directory. eda()
# below plots from this module-level DataFrame.
data = pd.read_csv(filepath_or_buffer="P1G5_Set_1_Titan_Russo.csv")
def eda():
    """Render the exploratory-data-analysis page in Streamlit.

    The page is built from the module-level ``data`` DataFrame and has three
    sections, each followed by a short written explanation:

    1. ``limit_balance`` vs. ``bill_amt_1`` .. ``bill_amt_6`` scatter plots,
       coloured by ``default_payment_next_month``.
    2. A heatmap of the correlation matrix of the numeric columns.
    3. Histograms (with KDE) of the non-categorical columns.

    Every figure is created explicitly and handed to ``st.pyplot`` instead of
    relying on matplotlib's implicit "current figure", so no
    deprecation-suppressing ``st.set_option`` call is needed.
    """
    st.title("Eksploratory Data Analysis")
    st.write('Analyze the DataFrame for Better Understanding')

    _render_limit_vs_bill(data)
    _render_correlation_heatmap(data)
    _render_distributions(data)


def _show_figure(fig):
    """Display *fig* in the Streamlit app, then release it from pyplot."""
    st.pyplot(fig)
    # pyplot keeps every figure it created registered until it is closed.
    # Streamlit re-executes the script on each interaction, so unclosed
    # figures would pile up in memory across reruns.
    plt.close(fig)


def _render_limit_vs_bill(frame):
    """Draw one limit-balance vs. bill-amount scatter plot per billing month."""
    st.markdown(
        "<h2><b>Limit Balance vs. Bill Amount by Default Payment Next Month</b></h2>",
        unsafe_allow_html=True,
    )

    palette = ["#FF0000", "#4129E1"]  # custom colors, one per hue level

    for month in range(1, 7):
        fig, ax = plt.subplots()
        sns.scatterplot(
            x="limit_balance",
            y=f"bill_amt_{month}",
            hue="default_payment_next_month",
            data=frame,
            palette=palette,
            ax=ax,
        )
        # The y axis is bill_amt_<month>, so the title names the bill amount
        # (it previously said "Pay Amount", which mislabelled the plot).
        ax.set_title(
            f"Limit Balance vs. Bill Amount {month} by Default Payment Next Month"
        )
        _show_figure(fig)

    st.write("**Explanation**:")
    explanation = (
        "\n"
        "These plots indicate that a higher `limit_balance` means a higher "
        "likelihood of non defaulting on payments.\n"
    )
    st.markdown(explanation)


def _render_correlation_heatmap(frame):
    """Draw an annotated heatmap of the correlation matrix of *frame*."""
    st.markdown(
        "<h2><b>Heatmap of Correlation Matrix</b></h2>",
        unsafe_allow_html=True,
    )

    # Correlate numeric/boolean columns only: pandas >= 2.0 no longer drops
    # non-numeric columns silently, so a single text column would make
    # DataFrame.corr() raise.
    corr_matrix = frame.select_dtypes(include=["number", "bool"]).corr()

    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title('Heatmap of Correlation Matrix')
    _show_figure(fig)

    st.write("")  # Add a blank line
    st.write("**Explanation**:")
    explanation = (
        "\n"
        "From the heatmap we can see the correlation between each columns. "
        "We can see the `pay_0`, `pay_2`, `pay_3`, `pay_4`, `pay_5`, `pay_6` "
        "have correlation each others from categorical columns meanwhile we "
        "can see `bill_amt_1`, `bill_amt_2`, `bill_amt_3`, `bill_amt_4`, "
        "`bill_amt_5`, `bill_amt_6` have correlation each others from "
        "numerical columns\n"
    )
    st.markdown(explanation)


def _render_distributions(frame):
    """Draw a grid of histograms (with KDE) for the non-categorical columns."""
    st.markdown(
        "<h2><b>Checking Distribution Data</b></h2>",
        unsafe_allow_html=True,
    )

    # Columns to plot
    cols_num = [
        'limit_balance', 'age',
        'bill_amt_1', 'bill_amt_2', 'bill_amt_3',
        'bill_amt_4', 'bill_amt_5', 'bill_amt_6',
        'pay_amt_1', 'pay_amt_2', 'pay_amt_3',
        'pay_amt_4', 'pay_amt_5', 'pay_amt_6',
    ]

    # One shared figure holding a 5x4 grid of axes, flattened so the axes can
    # be walked in the same order as the column list.
    fig, axes = plt.subplots(5, 4, figsize=(18, 15))
    axes = axes.flatten()

    # Histogram with a kernel density estimate for each column.
    for ax, col in zip(axes, cols_num):
        sns.histplot(frame[col], ax=ax, kde=True)
        ax.set_title(f'Distribution {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Remove the grid cells that have no column to show.
    for unused_ax in axes[len(cols_num):]:
        unused_ax.remove()

    fig.tight_layout()
    _show_figure(fig)

    st.write("")  # Add a blank line
    st.write("**Explanation**:")
    st.write('Checking the distribution data from non categorical columns, we can say the data is positive skewed')