|
|
|
import streamlit as st
|
|
import numpy as np
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
|
|
def app():
    """Render the Exploratory Data Analysis page.

    Reads ``eda_data.csv``, shows it under a preview subheader, then prints
    each analysis question followed by the chart produced for it.
    """
    st.title('Exploratory Data Analysis')

    dataset = pd.read_csv('eda_data.csv')
    st.subheader('Dataset Preview')
    st.write(dataset)

    st.subheader('Data Analysis Questions')

    # Every question is paired with the plotting function called right after
    # it; the pairs are rendered top to bottom in this order.
    question_charts = (
        ('How is the percentage of default payment as education level increases?', vis_1),
        ('How is the contribution of each gender to default payment?', vis_2),
        ('Which one got more into default payment, customers with limit balance above or below average?', vis_3),
        ('How does the average of default payment changes as the total late payment rises?', vis_4),
        ('How is the contribution of each marital status to default payment?', vis_5),
    )
    for question, draw_chart in question_charts:
        st.write(question)
        draw_chart(dataset)
|
|
|
|
|
|
def vis_1(df):
    """Plot the default-payment percentage for each education level.

    For every ``education_level`` group, the sum of
    ``default_payment_next_month`` is divided by the number of non-null
    values, multiplied by 100 and rounded to two decimals. The result is
    drawn as a bar chart on a fixed 0-100 y-axis and rendered with
    ``st.pyplot``.

    Args:
        df: DataFrame containing ``education_level`` and
            ``default_payment_next_month`` columns. It is not modified.
    """
    flags_by_level = df.groupby('education_level')['default_payment_next_month']
    # NOTE(review): sum / count is a percentage only if the column is a 0/1
    # flag - confirm against the dataset.
    percentages = (flags_by_level.sum() / flags_by_level.count() * 100).round(2)

    # A dedicated figure keeps this chart independent of whatever is on the
    # shared "current" pyplot figure; it is always closed afterwards.
    fig, ax = plt.subplots()
    try:
        ax.set_title('Default Payment Percentage for each Education Level')
        bars = ax.bar(percentages.index, percentages.to_numpy())

        ax.set_ylim([0, 100])
        ax.set_xlabel('education level')
        ax.set_ylabel('default payment percentage')

        # NOTE(review): assumes education_level is coded 1-4 in this label
        # order - confirm against the dataset.
        ax.set_xticks([1, 2, 3, 4])
        ax.set_xticklabels(['graduate_school', 'university', 'highschool', 'others'])

        # Write each bar's value slightly above its top edge.
        for bar in bars:
            height = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width() / 2.,
                1.05 * height,
                f'{height}%',
                ha='center',
                va='bottom',
            )

        st.pyplot(fig)
    finally:
        plt.close(fig)
|
|
|
|
def vis_2(df):
    """Draw a pie chart of summed default payments per ``sex`` group.

    ``default_payment_next_month`` is summed for each ``sex`` value and the
    sums are shown as pie slices with one-decimal percentage captions. The
    chart is rendered with ``st.pyplot`` and the current figure is cleared
    afterwards.

    Args:
        df: DataFrame containing ``sex`` and ``default_payment_next_month``
            columns. It is not modified.
    """
    defaults_per_sex = df.groupby('sex')['default_payment_next_month'].sum()

    axes = plt.gca()
    axes.set_title('Gender Contribution to Default Payment')
    # Labels are attached in sorted group order.
    # NOTE(review): assumes exactly two sex codes, male first - confirm.
    axes.pie(
        defaults_per_sex.to_numpy(),
        labels=['male', 'female'],
        autopct='%1.1f%%',
    )

    st.pyplot(plt.gcf())
    plt.clf()
|
|
|
|
def vis_3(df):
    """Plot summed default payments for below- vs. above-average limit balance.

    Each row is labelled ``'below_average'`` when its ``limit_balance`` is
    strictly below the column mean and ``'above_average'`` otherwise.
    ``default_payment_next_month`` is summed per label and drawn as a
    horizontal bar chart with the value printed at the end of each bar. The
    chart is rendered with ``st.pyplot``.

    Args:
        df: DataFrame containing ``limit_balance`` and
            ``default_payment_next_month`` columns. It is not modified.
    """
    min_x_limit = 550  # x-axis upper bound used when every bar fits in it

    limit = df['limit_balance']
    # Values equal to the mean (and missing values) fall into 'above_average',
    # matching the strict "<" comparison.
    limit_group = np.where(limit < limit.mean(), 'below_average', 'above_average')

    defaults_by_group = (
        df.assign(limit_group=limit_group)
        .groupby('limit_group')['default_payment_next_month']
        .sum()
    )

    # A dedicated figure keeps this chart independent of whatever is on the
    # shared "current" pyplot figure; it is always closed afterwards.
    fig, ax = plt.subplots()
    try:
        ax.set_title('Default Payment Amount Categorized by Limit Group')
        bars = ax.barh(defaults_by_group.index.tolist(), defaults_by_group.to_numpy())

        ax.set_xlabel('default payment amount')
        ax.set_ylabel('limit balance group')

        # Keep the fixed 0-550 range for small data, but widen it (with 10%
        # headroom for the value labels) when a bar would otherwise be cut off.
        largest = defaults_by_group.max() if len(defaults_by_group) else 0
        ax.set_xlim([0, max(min_x_limit, largest * 1.1)])

        ax.bar_label(bars)

        st.pyplot(fig)
    finally:
        plt.close(fig)
|
|
|
|
def vis_4(df):
    """Plot the mean default payment against the summed ``pay_1``..``pay_6``.

    The six ``pay_<i>`` columns are added up per row into a total, rows are
    grouped by that total, and the mean of ``default_payment_next_month`` for
    each total is drawn as a line chart rendered with ``st.pyplot``.

    Args:
        df: DataFrame containing ``pay_1`` .. ``pay_6`` and
            ``default_payment_next_month`` columns. It is not modified.
    """
    pay_columns = [f'pay_{month}' for month in range(1, 7)]
    # Missing values count as 0; min_count=1 leaves the total missing only
    # when all six values are missing, so such rows drop out of the grouping.
    # NOTE(review): presumably each pay_<i> is months of delay - confirm;
    # any negative codes in the data would lower the total.
    total_late_payment = df[pay_columns].sum(axis=1, min_count=1)

    average_default = (
        df.assign(total_late_payment=total_late_payment)
        .groupby('total_late_payment')['default_payment_next_month']
        .mean()
    )

    # A dedicated figure keeps this chart independent of whatever is on the
    # shared "current" pyplot figure; it is always closed afterwards.
    fig, ax = plt.subplots()
    try:
        ax.set_title('The Effect of Late Payment to Default Payment')
        ax.plot(average_default.index, average_default.to_numpy())

        ax.set_xlabel('total late payment (month)')
        ax.set_ylabel('average default payment')

        st.pyplot(fig)
    finally:
        plt.close(fig)
|
|
|
|
def vis_5(df):
    """Draw a pie chart of summed default payments per ``marital_status`` group.

    ``default_payment_next_month`` is summed for each ``marital_status``
    value and the sums are shown as pie slices with one-decimal percentage
    captions. The chart is rendered with ``st.pyplot`` and the current figure
    is cleared afterwards.

    Args:
        df: DataFrame containing ``marital_status`` and
            ``default_payment_next_month`` columns. It is not modified.
    """
    slice_labels = ['others', 'married', 'single', 'divorced']
    defaults_per_status = df.groupby('marital_status')['default_payment_next_month'].sum()

    axes = plt.gca()
    axes.set_title('Contribution to Default Payment by Marital Status ')
    # Labels are attached in sorted group order.
    # NOTE(review): assumes exactly four marital_status codes in the order
    # listed above - confirm against the dataset.
    axes.pie(defaults_per_status.to_numpy(), labels=slice_labels, autopct='%1.1f%%')

    st.pyplot(plt.gcf())
    plt.clf()
|
|
|