Spaces:
Sleeping
Sleeping
# import libraries | |
import streamlit as st | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
def app():
    """Render the Exploratory Data Analysis page.

    Reads ``eda_data.csv``, shows it as a preview, then writes each analysis
    question followed by the chart produced by its ``vis_*`` function.
    """
    st.title('Exploratory Data Analysis')
    dataset = pd.read_csv('eda_data.csv')

    st.subheader('Dataset Preview')
    st.write(dataset)

    st.subheader('Data Analysis Questions')
    # Each question is paired with the function that draws its chart.
    sections = (
        ('How is the percentage of default payment as education level increases?', vis_1),
        ('How is the contribution of each gender to default payment?', vis_2),
        ('Which one got more into default payment, customers with limit balance above or below average?', vis_3),
        ('How does the average of default payment changes as the total late payment rises?', vis_4),
        ('How is the contribution of each marital status to default payment?', vis_5),
    )
    for question, render_chart in sections:
        st.write(question)
        render_chart(dataset)
def vis_1(df):
    """Draw a bar chart of the default-payment percentage per education level.

    For every ``education_level`` group the percentage is
    ``sum(default_payment_next_month) / count(default_payment_next_month) * 100``
    rounded to two decimals. The chart is handed to Streamlit and the current
    pyplot figure is cleared afterwards.

    Args:
        df: DataFrame with ``education_level`` and
            ``default_payment_next_month`` columns. It is not modified.
    """
    # One groupby serves both aggregates; the arithmetic is vectorised instead
    # of indexing two separately grouped frames row by row.
    defaults = df.groupby(by='education_level')['default_payment_next_month']
    percentages = (defaults.sum() / defaults.count() * 100).round(2)
    df_eda_1_final = percentages.rename('default_payment_percentage').reset_index()

    fig = plt.gcf()
    ax = plt.gca()
    ax.set_title('Default Payment Percentage for each Education Level')
    bars = ax.bar(
        x='education_level',
        height='default_payment_percentage',
        data=df_eda_1_final,
    )

    # Percentages share a fixed 0-100 scale.
    ax.set_ylim([0, 100])
    ax.set_xlabel('education level')
    ax.set_ylabel('default payment percentage')

    # NOTE(review): the tick positions assume education_level is coded 1-4 in
    # this order -- confirm against the dataset.
    ax.set_xticks([1, 2, 3, 4])
    ax.set_xticklabels(['graduate_school', 'university', 'highschool', 'others'])

    # Label only the bars drawn above (not every patch on the axes), placing
    # the text slightly above each bar's top.
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.,
            1.05 * height,
            f'{height}%',
            ha='center',
            va='bottom',
        )

    st.pyplot(fig)
    plt.clf()
def vis_2(df):
    """Show a pie chart of summed ``default_payment_next_month`` per ``sex`` group.

    Args:
        df: DataFrame with ``sex`` and ``default_payment_next_month`` columns.
    """
    defaults_by_sex = (
        df.groupby(by='sex')['default_payment_next_month']
        .sum()
        .reset_index()
    )

    figure = plt.gcf()
    axes = plt.gca()
    axes.set_title('Gender Contribution to Default Payment')
    # NOTE(review): labels are applied positionally to the grouped rows, so
    # this presumably expects exactly two sex groups ordered male, female --
    # confirm against the dataset encoding.
    axes.pie(
        x='default_payment_next_month',
        labels=['male', 'female'],
        autopct='%1.1f%%',
        data=defaults_by_sex,
    )

    st.pyplot(figure)
    plt.clf()
def vis_3(df):
    """Show a horizontal bar chart of summed default payments per limit group.

    Rows whose ``limit_balance`` is strictly below the column mean are tagged
    ``below_average``; all others are tagged ``above_average``.

    Args:
        df: DataFrame with ``limit_balance`` and
            ``default_payment_next_month`` columns. A copy is used, so the
            caller's frame does not gain the helper column.
    """
    working = df.copy()
    mean_limit = working['limit_balance'].mean()

    # Tag every row relative to the mean limit balance.
    working['limit_group'] = [
        'below_average' if balance < mean_limit else 'above_average'
        for balance in working['limit_balance']
    ]

    defaults_per_group = (
        working.groupby(by='limit_group')['default_payment_next_month']
        .sum()
        .reset_index()
    )

    figure = plt.gcf()
    axes = plt.gca()
    axes.set_title('Default Payment Amount Categorized by Limit Group')
    bars = axes.barh(
        y='limit_group',
        width='default_payment_next_month',
        label='default_payment_next_month',
        data=defaults_per_group,
    )

    axes.set_xlabel('default payment amount')
    axes.set_ylabel('limit balance group')
    # NOTE(review): the upper x limit is hard-coded; a group total above 550
    # would be cut off -- confirm it suits the dataset.
    axes.set_xlim([0, 550])
    axes.bar_label(bars)

    st.pyplot(figure)
    plt.clf()
def vis_4(df):
    """Plot mean ``default_payment_next_month`` against the summed pay columns.

    The ``pay_1`` .. ``pay_6`` columns are added up per row into
    ``total_late_payment``; the chart shows, for each distinct total, the mean
    of ``default_payment_next_month``. The chart is handed to Streamlit and
    the current pyplot figure is cleared afterwards.

    Args:
        df: DataFrame with ``pay_1`` .. ``pay_6`` and
            ``default_payment_next_month`` columns. A copy is used, so the
            caller's frame does not gain the helper column.
    """
    df_eda_4 = df.copy()

    # Row-wise sum of the six pay columns. This avoids seeding the total from
    # an empty, dtype-less Series, whose default dtype differs between pandas
    # versions. min_count=1 leaves a row as NaN only when all six values are
    # missing; any other missing value counts as 0.
    pay_columns = [f'pay_{i}' for i in range(1, 7)]
    df_eda_4['total_late_payment'] = df_eda_4[pay_columns].sum(axis=1, min_count=1)

    # Mean (not sum) of default payment for each distinct total.
    df_eda_4_grouped = (
        df_eda_4.groupby(by='total_late_payment')['default_payment_next_month']
        .mean()
        .reset_index()
    )

    fig = plt.gcf()
    ax = plt.gca()
    ax.set_title('The Effect of Late Payment to Default Payment')
    ax.plot(
        df_eda_4_grouped['total_late_payment'],
        df_eda_4_grouped['default_payment_next_month'],
    )
    ax.set_xlabel('total late payment (month)')
    ax.set_ylabel('average default payment')

    st.pyplot(fig)
    plt.clf()
def vis_5(df):
    """Show a pie chart of summed ``default_payment_next_month`` per marital status.

    Args:
        df: DataFrame with ``marital_status`` and
            ``default_payment_next_month`` columns.
    """
    defaults_by_status = (
        df.groupby(by='marital_status')['default_payment_next_month']
        .sum()
        .reset_index()
    )

    figure = plt.gcf()
    axes = plt.gca()
    axes.set_title('Contribution to Default Payment by Marital Status ')
    # NOTE(review): labels are applied positionally to the grouped rows, so
    # this presumably expects exactly four marital_status groups in this
    # order -- confirm against the dataset encoding.
    axes.pie(
        x='default_payment_next_month',
        labels=['others', 'married', 'single', 'divorced'],
        autopct='%1.1f%%',
        data=defaults_by_status,
    )

    st.pyplot(figure)
    plt.clf()